Skip to content

Commit

Permalink
Merge pull request #20274 from pdostal/ssh_key_checking
Browse files Browse the repository at this point in the history
PC: Fixup ssh host key validation
  • Loading branch information
asmorodskyi authored Oct 9, 2024
2 parents 90f046c + bde2966 commit b141881
Show file tree
Hide file tree
Showing 15 changed files with 58 additions and 44 deletions.
7 changes: 7 additions & 0 deletions data/publiccloud/ssh_config_sap
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
StrictHostKeyChecking no
PasswordAuthentication no
UserKnownHostsFile /dev/null
IdentityFile %SSH_KEY%
HostKeyAlgorithms +ssh-rsa
LogLevel DEBUG3

37 changes: 27 additions & 10 deletions lib/publiccloud/instance.pm
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ sub wait_for_guestregister {

=head2 wait_for_ssh
wait_for_ssh([timeout => 600] [, proceed_on_failure => 0] [, ...])
wait_for_ssh([timeout => 600] [, proceed_on_failure => 0] [, scan_ssh_host_key => 0] [, ...])
When a remote pc instance is starting, wait_stop defaults to 0 (false) and
this routine polls until the SSH port of the remote instance is reachable and open.
Expand All @@ -364,6 +364,11 @@ Parameters:
timeout => total wait timeout; default: 600.
wait_stop => If true waits for ssh port to become unreachable, if false waits for ssh reachable; default: false.
proceed_on_failure => on failure, if false exit the test with an error, if true let the calling code continue; default: wait_stop.
scan_ssh_host_key => If true, rescan the SSH host key; default: false.
Set this to true when:
* SUT changes its public IP address
* SUT regenerates its SSH host keys
(e.g. when cloud-init state is cleared)
username => default: username().
public_ip => default: public_ip().
systemup_check => If true, checks if the system is up too, instead of just checking the ssh port; default: !wait_stop.
Expand All @@ -379,6 +384,7 @@ sub wait_for_ssh {
# Input parameters, see description in above head2 - Parameters section:
$args{timeout} = get_var('PUBLIC_CLOUD_SSH_TIMEOUT', $args{timeout} // 600);
$args{wait_stop} //= 0;
$args{scan_ssh_host_key} //= 0;
$args{proceed_on_failure} //= $args{wait_stop};
$args{systemup_check} //= not $args{wait_stop};
$args{logs} //= 1;
Expand Down Expand Up @@ -412,15 +418,14 @@ sub wait_for_ssh {
} # endif

# check also remote system is up and running:
# SSH host key is not checked and master socket is not used
my $retry = 0; # count retries of unexpected sysout
if ($args{systemup_check} and isok($exit_code)) {
# Install the server's SSH public keys to prevent authentication interactions
# or instance address changes during VM reboots.
script_run("ssh-keyscan $args{public_ip} | tee -a ~/.ssh/known_hosts /home/$testapi::username/.ssh/known_hosts");
my $ssh_opts = $self->ssh_opts() . ' -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ControlPath=none';
while (($duration = time() - $start_time) < $args{timeout}) {
# timeout recalculated removing consumed time until now
# We don't support password authentication so it would just block the terminal
$sysout = $self->ssh_script_output(cmd => 'sudo systemctl is-system-running',
$sysout = $self->ssh_script_output(cmd => 'sudo systemctl is-system-running', ssh_opts => $ssh_opts,
timeout => $args{timeout} - $duration, proceed_on_failure => 1, username => $args{username});
# result check
if ($sysout =~ m/initializing|starting/) { # still starting
Expand All @@ -434,7 +439,7 @@ sub wait_for_ssh {
elsif ($sysout =~ m/degraded/) { # up but with failed services to collect
$exit_code = 0;
$sysout .= "\nSystem booted, but some services failed:\n" .
$self->ssh_script_output(cmd => 'sudo systemctl --failed',
$self->ssh_script_output(cmd => 'sudo systemctl --failed', ssh_opts => $ssh_opts,
proceed_on_failure => 1, username => $args{username});
last;
}
Expand All @@ -450,6 +455,16 @@ sub wait_for_ssh {
sleep $delay;
} # end loop

if ($args{scan_ssh_host_key}) {
record_info('RESCAN', 'Rescanning SSH host key');
# Install the server's SSH public keys to prevent authentication interactions
# or instance address changes during VM reboots.
script_run("ssh-keyscan $args{public_ip} | tee ~/.ssh/known_hosts /home/$testapi::username/.ssh/known_hosts");
}

# Finally make sure that SSH works
$self->ssh_script_retry(cmd => "true", username => $args{username}, timeout => 90, retry => 5, delay => 3);

# Log upload
if (!get_var('PUBLIC_CLOUD_SLES4SAP') and $args{logs}) {
#Exclude 'mr_test/saptune' test case as it will introduce random softreboot failures.
Expand Down Expand Up @@ -492,7 +507,7 @@ sub isok {

=head2 softreboot
($shutdown_time, $bootup_time) = softreboot([timeout => 600]);
($shutdown_time, $bootup_time) = softreboot([timeout => 600] [, scan_ssh_host_key => 0]);
Does a softreboot of the instance by running the command C<shutdown -r>.
Return an array of two values, first one is the time till the instance isn't
Expand All @@ -502,6 +517,7 @@ reachable anymore. The second one is the estimated bootup time.
sub softreboot {
my ($self, %args) = @_;
$args{timeout} //= 600;
$args{scan_ssh_host_key} //= 0;
$args{username} //= $self->username();
# see detailed explanation inside wait_for_ssh

Expand Down Expand Up @@ -532,7 +548,7 @@ sub softreboot {

my $shutdown_time = time() - $start_time;
die("Waiting for system down failed!") unless ($shutdown_time < $args{timeout});
my $bootup_time = $self->wait_for_ssh(timeout => $args{timeout} - $shutdown_time, username => $args{username});
my $bootup_time = $self->wait_for_ssh(timeout => $args{timeout} - $shutdown_time, username => $args{username}, scan_ssh_host_key => $args{scan_ssh_host_key});

# ensure the tunnel-console is healthy, useful to detect possible issues with the serial terminal early
assert_script_run("true", fail_message => "console is broken");
Expand Down Expand Up @@ -563,7 +579,7 @@ sub stop {

=head2 start
start([timeout => ?]);
start([timeout => 600] [, scan_ssh_host_key => 0]);
Start the instance and wait for the system to be up.
Returns the number of seconds till the system up and running.
Expand All @@ -572,8 +588,9 @@ Returns the number of seconds till the system up and running.
sub start {
my ($self, %args) = @_;
$args{timeout} //= 600;
$args{scan_ssh_host_key} //= 0;
$self->provider->start_instance($self, @_);
return $self->wait_for_ssh(timeout => $args{timeout});
return $self->wait_for_ssh(timeout => $args{timeout}, scan_ssh_host_key => $args{scan_ssh_host_key});
}

=head2 get_state
Expand Down
8 changes: 6 additions & 2 deletions lib/publiccloud/provider.pm
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,11 @@ Creates ~/.ssh/config file with all the common ssh client settings

sub place_ssh_config {
# configure ssh client
my $ssh_config_url = data_url('publiccloud/ssh_config');
# ssh will be configured by a ~/.ssh/config file; the config file comes from a template.
# By default the template is the publiccloud/ssh_config data file.
# The user can override the template with the PUBLIC_CLOUD_SSH_CONFIG variable.
# From now on all ssh calls will use this configuration file.
my $ssh_config_url = data_url(get_var('PUBLIC_CLOUD_SSH_CONFIG', 'publiccloud/ssh_config'));
assert_script_run("curl $ssh_config_url -o ~/.ssh/config");
file_content_replace("~/.ssh/config", "%SSH_KEY%" => get_ssh_private_key_path());
}
Expand Down Expand Up @@ -365,7 +369,7 @@ sub create_instances {
record_info("INSTANCE", $instance->{instance_id});
if ($args{check_connectivity}) {
$instance->wait_for_ssh(timeout => $args{timeout},
proceed_on_failure => $args{proceed_on_failure});
proceed_on_failure => $args{proceed_on_failure}, scan_ssh_host_key => 1);
}
# check guestregister conditional, default yes:
$instance->wait_for_guestregister() if ($args{check_guestregister});
Expand Down
6 changes: 3 additions & 3 deletions lib/sles4sap_publiccloud.pm
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ sub run_cmd {
delete($args{timeout});
delete($args{runas});

$self->{my_instance}->wait_for_ssh(timeout => $timeout);
$self->{my_instance}->wait_for_ssh(timeout => $timeout, scan_ssh_host_key => 1);
my $out = $self->{my_instance}->run_ssh_command(cmd => "sudo $cmd", timeout => $timeout, %args);
record_info("$title output - $self->{my_instance}->{instance_id}", $out) unless ($timeout == 0 or $args{quiet} or $args{rc_only});
return $out;
Expand Down Expand Up @@ -385,7 +385,7 @@ sub stop_hana {
# Crash needs to be executed as root and wait for host reboot

# Ensure the remote node is in a normal state before triggering the crash
$self->{my_instance}->wait_for_ssh(timeout => $timeout);
$self->{my_instance}->wait_for_ssh(timeout => $timeout, scan_ssh_host_key => 1);

$self->{my_instance}->run_ssh_command(cmd => "sudo su -c sync", timeout => $timeout);

Expand Down Expand Up @@ -414,7 +414,7 @@ sub stop_hana {
record_info("Wait ssh disappear end", "out:" . ($out // 'undefined'));
# wait for node to be ready
wait_hana_node_up($self->{my_instance}, timeout => 900);
$out = $self->{my_instance}->wait_for_ssh(timeout => 900);
$out = $self->{my_instance}->wait_for_ssh(timeout => 900, scan_ssh_host_key => 1);
record_info("Wait ssh is back again", "out:" . ($out // 'undefined'));
}
else {
Expand Down
23 changes: 1 addition & 22 deletions lib/sles4sap_publiccloud_basetest.pm
Original file line number Diff line number Diff line change
Expand Up @@ -112,22 +112,6 @@ if no argument is provided, uses the following defaults:
C<-E /var/tmp/ssh_sut.log>: save logging to B</var/tmp/ssh_sut.log>.
=item *
C<-F none>: do not use SSH configuration files.
=item *
C<-o LogLevel=DEBUG3>: set log level to B<DEBUG3>.
=item *
C<-o PasswordAuthentication=no>: do not allow authentication via password.
=item *
C<-i 'get_ssh_private_key_path()'>: use the generated private SSH key
=back
B<Note>: if the method receives an empty string, no SSH options will be set.
Expand All @@ -138,12 +122,7 @@ sub set_cli_ssh_opts {
my ($self, $ssh_opts) = @_;
croak("Expected \$self->{my_instance} is not defined. Check module Description for details")
unless $self->{my_instance};
$ssh_opts //= join(' ',
'-E', '/var/tmp/ssh_sut.log',
'-F', 'none',
'-o', 'LogLevel=DEBUG3',
'-o', 'PasswordAuthentication=no',
'-i', "'" . get_ssh_private_key_path() . "'");
$ssh_opts //= join(' ', '-E', '/var/tmp/ssh_sut.log');
$self->{my_instance}->ssh_opts($ssh_opts);
}

Expand Down
1 change: 1 addition & 0 deletions schedule/sles4sap/publiccloud_hanasr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ vars:
PUBLIC_CLOUD_RESOURCE_GROUP: 'hanasr'
PUBLIC_CLOUD: '1'
TEST_CONTEXT: 'OpenQA::Test::RunArgs'
PUBLIC_CLOUD_SSH_CONFIG: 'publiccloud/ssh_config_sap'
schedule:
- boot/boot_to_desktop
- sles4sap/publiccloud/hana_sr_schedule_deployment
Expand Down
1 change: 1 addition & 0 deletions schedule/sles4sap/sles4sap_gnome_saptune.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ vars:
BOOT_HDD_IMAGE: '1'
NODE_COUNT: '1'
TEST_CONTEXT: 'OpenQA::Test::RunArgs'
PUBLIC_CLOUD_SSH_CONFIG: 'publiccloud/ssh_config_sap'
# Below have to be entered in the OpenQA UI because it doesn't read this YAML
# HDD_1: SLE-%VERSION%-%ARCH%-Build%BUILD%-sles4sap-gnome.qcow2
# START_AFTER_TEST: create_hdd_sles4sap_gnome
Expand Down
1 change: 1 addition & 0 deletions schedule/sles4sap/sles4sap_gnome_saptune_maintenance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ vars:
# Below have to be entered in the OpenQA UI because it doesn't read this YAML
# HDD_1: SLE-%VERSION%-%ARCH%-Build%BUILD%-sles4sap-gnome.qcow2
# START_AFTER_TEST: create_hdd_sles4sap_gnome
PUBLIC_CLOUD_SSH_CONFIG: 'publiccloud/ssh_config_sap'
schedule:
- boot/boot_to_desktop
- '{{PC_instance_ssh_interactive_start}}'
Expand Down
3 changes: 2 additions & 1 deletion tests/publiccloud/bsc_1205002.pm
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ sub run {

$provider->start_instance($instance);

$instance->wait_for_ssh();
# The instance changes its public IP address so the key must be rescanned
$instance->wait_for_ssh(scan_ssh_host_key => 1);
$instance->ssh_assert_script_run("echo we can login");
}

Expand Down
6 changes: 4 additions & 2 deletions tests/publiccloud/patch_and_reboot.pm
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,14 @@ sub run {
record_info('UNAME', $args->{my_instance}->ssh_script_output(cmd => 'uname -a'));
$args->{my_instance}->ssh_assert_script_run(cmd => 'rpm -qa > /tmp/rpm-qa.txt');
$args->{my_instance}->upload_log('/tmp/rpm-qa.txt');
$args->{my_instance}->cleanup_cloudinit() if (is_cloudinit_supported);
$args->{my_instance}->softreboot(timeout => get_var('PUBLIC_CLOUD_REBOOT_TIMEOUT', 600));

if (is_cloudinit_supported) {
$args->{my_instance}->cleanup_cloudinit();
$args->{my_instance}->softreboot(timeout => get_var('PUBLIC_CLOUD_REBOOT_TIMEOUT', 600), scan_ssh_host_key => 1);
$args->{my_instance}->check_cloudinit();
permit_root_login($args->{my_instance});
} else {
$args->{my_instance}->softreboot(timeout => get_var('PUBLIC_CLOUD_REBOOT_TIMEOUT', 600));
}
}

Expand Down
1 change: 1 addition & 0 deletions tests/sles4sap/publiccloud/azure_fence_agents_test.pm
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ sub run {
$self->{my_instance} = $instance;
# do not probe VMs that are not part of the cluster
next unless grep(/^$instance->{instance_id}$/, @cluster_nodes);
$instance->wait_for_ssh(scan_ssh_host_key => 1);
my $scp_cmd = join('', 'scp /tmp/bashrc ',
$instance->{username},
'@', $instance->{public_ip},
Expand Down
2 changes: 1 addition & 1 deletion tests/sles4sap/publiccloud/hana_sr_takeover.pm
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ sub run {

# Stop/kill/crash HANA DB and wait till SSH is again available with pacemaker running.
$self->stop_hana(method => $takeover_action);
$self->{my_instance}->wait_for_ssh(username => 'cloudadmin');
$self->{my_instance}->wait_for_ssh(username => 'cloudadmin', scan_ssh_host_key => 1);

# SBD delay is active only after reboot
if ($takeover_action eq 'crash' || $takeover_action eq 'stop') {
Expand Down
2 changes: 1 addition & 1 deletion tests/sles4sap/publiccloud/hana_sr_test_secondary.pm
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ sub run {
$sbd_delay = $self->sbd_delay_formula if $db_action eq 'crash';

$self->stop_hana(method => $db_action);
$self->{my_instance}->wait_for_ssh(username => 'cloudadmin');
$self->{my_instance}->wait_for_ssh(username => 'cloudadmin', scan_ssh_host_key => 1);

# SBD delay is active only after reboot
if ($db_action eq 'crash' || $db_action eq 'stop') {
Expand Down
2 changes: 1 addition & 1 deletion tests/sles4sap/publiccloud/qesap_prevalidate.pm
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ sub run {
$self->{my_instance} = $instance;
my $instance_id = $instance->{'instance_id'};
# Check ssh connection for all hosts
$instance->wait_for_ssh;
$instance->wait_for_ssh(scan_ssh_host_key => 1);

# Skip instances without HANA db or setup without cluster
next if ($instance_id !~ m/vmhana/) or !$ha_enabled;
Expand Down
2 changes: 1 addition & 1 deletion tests/sles4sap/publiccloud/qesap_terraform.pm
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ sub run {
my $expected_hostname = $instance->{instance_id};
$instance->wait_for_ssh();
# Does not fail for some reason.
my $real_hostname = $instance->run_ssh_command(cmd => 'hostname', username => 'cloudadmin');
my $real_hostname = $instance->ssh_script_output(cmd => 'hostname', username => 'cloudadmin');
# We expect hostnames reported by terraform to match the actual hostnames in Azure and GCE
die "Expected hostname $expected_hostname is different than actual hostname [$real_hostname]"
if ((is_azure() || is_gce()) && ($expected_hostname ne $real_hostname));
Expand Down

0 comments on commit b141881

Please sign in to comment.