#!/usr/bin/perl -w
################################################################################
# Copyright (c) 2003 University of Utah Student Computing Labs.
# All Rights Reserved.
#
# Permission to use, copy, modify, and distribute this software and
# its documentation for any purpose and without fee is hereby granted,
# provided that the above copyright notice appears in all copies and
# that both that copyright notice and this permission notice appear
# in supporting documentation, and that the name of The University
# of Utah not be used in advertising or publicity pertaining to
# distribution of the software without specific, written prior
# permission. This software is supplied as is without expressed or
# implied warranties of any kind.
#
# Version 2.0.7
#
# History
# 2004-12-01 Fixed utility stuff (delete|status), added "all" keyword.
# 2004-11-12 Fixed space in path bugs.
# 2004-10-31 Fixed bug with getting results from execute_command and a space in the
#            get_submit_command so that it wouldn't kill hung processes.
#            added status | delete option
# 2004-07-20 Added environment variable check, more comments, $poll_wait variable
#
################################################################################

# Set XGRID_CONTROLLER_HOSTNAME and XGRID_CONTROLLER_PASSWORD before running this script.
# WARNING: if you use a dns name, this script will send about 13000 hits to your dns server.
# So use IP number or Rendezvous if you can.  Or you can set $poll_wait to a higher number below.

# Example: xgrid_job_submit_v2.pl workingdir outdir ./command "args 1 2 etc" 10 1 range 1 15

use strict;
use warnings;

my $poll_sleep_value   = 10;  # sleep this number of seconds before starting the loop over.
                              # If your jobs are short, you want to decrease this.
                              # But beware, if this script checks too often, bad things
                              # could happen (in one case my controller was crashed with
                              # a zillion network connections opened to the computer
                              # running this loop too often)...
my $poll_wait          = 2;   # sleep this number of seconds between status commands
my $sleep_time         = 1;   # sleep this number of seconds between all commands
my $stop_xgrid_crashes = 1;   # set to 1 if you want to run this script on the controller
my $hang_wait_time     = 60;  # this is the longest this script will wait before killing the xgrid command

#----------------------------------------------------------------------#
# You might have to edit stuff below...
#----------------------------------------------------------------------#

my $xgrid = "/usr/bin/xgrid";

# Scratch file that receives xgrid output in utility (delete | status) mode.
my $utility_output = "/tmp/xgrid_job_tmp";

# Force buffer flushing on both STDOUT and STDERR.
$|++;
my $oldhandle = select(STDERR);
$|++;
select($oldhandle);

# Check that XGRID_CONTROLLER_HOSTNAME and XGRID_CONTROLLER_PASSWORD are set
# (an empty value counts as unset).
for my $env_name (qw(XGRID_CONTROLLER_HOSTNAME XGRID_CONTROLLER_PASSWORD)) {
    if ( !defined $ENV{$env_name} or $ENV{$env_name} eq "" ) {
        die "Please set XGRID_CONTROLLER_HOSTNAME and XGRID_CONTROLLER_PASSWORD environment variables\n";
    }
}

###
## UTILITY STUFF:  ( delete | status ) ( all | <first job id> <last job id> )
###
my $print_usage = 0;
if ( defined $ARGV[0] and ( $ARGV[0] eq "delete" or $ARGV[0] eq "status" ) ) {
    my $action = $ARGV[0];

    if ( defined $ARGV[1] and $ARGV[1] eq "all" ) {
        # Ask the controller for every job id it knows about.
        my @joblist = `"$xgrid" -job list | grep jobIdentifier | awk ' { print \$3 }' | sed 's/;//g'`;
        chomp @joblist;
        for my $job_id (@joblist) {
            run_utility_action( $action, $job_id );
        }
        exit;
    }
    elsif ( defined $ARGV[1] and defined $ARGV[2] ) {
        for ( my $job_id = $ARGV[1] ; $job_id <= $ARGV[2] ; $job_id++ ) {
            run_utility_action( $action, $job_id );
        }
        exit;
    }
    else {
        # Missing or incomplete id range: fall through to the usage message
        # instead of exiting silently.
        $print_usage = 1;
    }
}

# Submit mode needs at least one job after the list/range keyword, and a
# range needs both its start and its end.
if ( !defined $ARGV[7] ) {
    $print_usage = 1;
}
elsif ( $ARGV[6] eq "range" and !defined $ARGV[8] ) {
    $print_usage = 1;
}

if ($print_usage) {
    print <<"END_USAGE";
Usage: $0 <working dir> <output dir> <agent script> <agent args> <max submitted jobs> <jobs per agent> [ range | list ] [ <start> <end> | ( <job> ... ) ]
 Example 1: xgrid_job_submit_v5.pl workingdir outdir echo "jobnumber " 10 1 range 1 15
 Example 2: xgrid_job_submit_v5.pl workingdir outdir echo "jobnumber " 10 1 list 1 5 9 15
or
$0 ( delete | status ) ( all | <first job id> <last job id> )
END_USAGE
    exit 1;
}

# Get args
my $working_dir           = $ARGV[0];  # This is sent to each agent
my $output_dir            = $ARGV[1];  # Where results and logs are saved
my $agent_launcher_script = $ARGV[2];  # This is the first script that runs on the agent
my $agent_launcher_args   = $ARGV[3];
my $max_submitted_jobs    = $ARGV[4];  # This is the maximum number of jobs this script will submit to the xgrid controller
my $jobs_per_agent        = $ARGV[5];  # Step between job numbers in range mode; also used to
                                       # compute the last job number handed to the agent script.
my $list_or_range         = $ARGV[6];

# Build the list of job numbers to submit.
my @jobs = ();
if ( $list_or_range eq "list" ) {
    @jobs = @ARGV[ 7 .. $#ARGV ];
}
elsif ( $list_or_range eq "range" ) {
    my $job_start = $ARGV[7];    # The start range
    my $job_end   = $ARGV[8];    # The end range

    # A zero or negative step would never reach $job_end.
    if ( !( $jobs_per_agent > 0 ) ) {
        die "Jobs per agent must be greater than zero when using a range\n";
    }
    for ( my $job = $job_start ; $job <= $job_end ; $job += $jobs_per_agent ) {
        push @jobs, $job;
    }
}
else {
    die "Paramaters bad\n";
}

my $submit_log_location  = "$output_dir/submit";
my $status_log_location  = "$output_dir/status";
my $results_log_location = "$output_dir/results";
my $delete_log_location  = "$output_dir/delete";

# Build the parameters that follow "-job submit" for one job.
#   $job_number     - first job number handled by this submission
#   $jobs_per_agent - how many consecutive job numbers the agent handles
# Returns the parameter string (working directory option, if the directory
# exists, followed by the agent command line).
sub get_submit_command {
    my ( $job_number, $jobs_per_agent ) = @_;

    ##
    # PUT YOUR XGRID SUBMIT COMMAND HERE
    #
    # You have 2 variables you need to tweak with: $job_number & $jobs_per_agent.
    # $job_number changes over time.  It starts using the $job_start variable, and
    # increments by $jobs_per_agent.  The highest number that will be processed is
    # $job_end.
    #
    # If you want to add stdin, do it here.
    #
    ##
    my $last_job      = $job_number + $jobs_per_agent - 1;
    my $agent_command = "$agent_launcher_script $job_number $last_job $agent_launcher_args";

    # Only send a working directory that actually exists.  When it does not,
    # return the bare command (no leading blank) so the string still matches
    # what ps reports during hang detection.
    if ( -e $working_dir ) {
        return "-in \"$working_dir\" $agent_command";
    }
    return $agent_command;
}

# Build the parameters that follow "-job results -id <id>" for one job:
# where stdout, stderr and the output files are saved.
sub get_results_command {
    my ($job_number) = @_;

    ##
    # PUT YOUR XGRID RESULTS COMMAND HERE (where to save stuff)
    #
    # All you really need to tweak is how the $job_number variable is dealt with.
    #
    ##
    my $stdout = "-so \"$output_dir/stdout/${job_number}_stdout\"";
    my $stderr = "-se \"$output_dir/stderr/${job_number}_stderr\"";
    my $output = "-out \"$output_dir\"";
    return "$stdout $stderr $output";
}

#----------------------------------------------------------------------#
# You should not have to edit anything below (hopefully)
#----------------------------------------------------------------------#

# Create the log and output directories (list-form system: no shell quoting).
for my $dir (
    $submit_log_location, $status_log_location,
    $results_log_location, $delete_log_location,
    "$output_dir/stdout/", "$output_dir/stderr/"
  )
{
    system( 'mkdir', '-p', $dir ) == 0
      or die "Could not create directory $dir\n";
}

# Each pass of RESTART begins the whole run from scratch; it is re-entered
# only if controller_crash() ever returns instead of exiting.
RESTART:
while (1) {

    # init iteration/control vars
    my $submitted_count      = 0;     # jobs currently held by the controller
    my $total_jobs_submitted = 0;     # index of the next entry of @jobs to submit
    my $not_finished         = 1;
    my %status_hash          = ();    # job number => xgrid id, waiting to finish
    my %results_hash         = ();    # job number => xgrid id, results to fetch
    my %delete_hash          = ();    # job number => xgrid id, to delete

    while ($not_finished) {
        print "Starting loop!\n";

        ##
        # SUBMIT
        ##
        while ( $submitted_count < $max_submitted_jobs
            and $total_jobs_submitted <= $#jobs )
        {
            my $this_job = $jobs[$total_jobs_submitted];
            print "submitting $this_job\n";

            my $my_submit_command = get_submit_command( $this_job, $jobs_per_agent );
            my $submit_command    = "-job submit $my_submit_command";
            my $submit_log        = "$submit_log_location/${this_job}_submit";

            my $result = execute_command( $xgrid, $submit_command, $stop_xgrid_crashes, 1, $submit_log );
            if ( defined $result and $result =~ /jobIdentifier\s*=\s*([0-9]+)\s*;/ ) {
                $status_hash{$this_job} = $1;
            }
            else {
                # Without a job id the job cannot be tracked; treat it like
                # an unreadable status reply.
                my $shown = defined $result ? $result : "";
                print `date` . "Controller has crashed: $shown.\nStarting over.";
                controller_crash();
                next RESTART;
            }
            $submitted_count++;
            $total_jobs_submitted++;
        }

        ##
        # STATUS
        ##
        # keys() takes a snapshot, so deleting entries inside the loop is safe.
        for my $job_number ( keys %status_hash ) {
            my $job_id = $status_hash{$job_number};
            sleep $poll_wait;

            my $status_command = "-job status -id $job_id";
            my $status_log     = "$status_log_location/${job_number}_status";
            my $status = execute_command( $xgrid, $status_command, $stop_xgrid_crashes, 1, $status_log );

            my $state;
            if ( defined $status and $status =~ /\s*jobStatus\s*=\s*([A-Za-z]+)\s*;/ ) {
                $state = $1;
            }
            if ( !defined $state ) {
                # NEED TO double check to make sure the controller crashed...
                # maybe a job just got deleted by something else.
                my $shown = defined $status ? $status : "";
                print `date` . "Controller has crashed: $shown.\nStarting over.";
                controller_crash();
                next RESTART;
            }

            print "getting status for job_number $job_number $job_id status is: $state\n";
            if ( $state eq "Finished" or $state eq "Failed" ) {
                $results_hash{$job_number} = $job_id;
                delete $status_hash{$job_number};
            }
        }

        ##
        # RESULTS
        ##
        for my $job_number ( keys %results_hash ) {
            my $job_id          = $results_hash{$job_number};
            my $results_params  = get_results_command($job_number);
            my $results_command = "-job results -id $job_id $results_params";
            my $results_log     = "$results_log_location/${job_number}_results";

            execute_command( $xgrid, $results_command, $stop_xgrid_crashes, 0, $results_log );
            print "getting results for job_number $job_number, job_id $job_id\n";

            $delete_hash{$job_number} = $job_id;
            delete $results_hash{$job_number};
        }

        ##
        # DELETE
        ##
        for my $job_number ( keys %delete_hash ) {
            my $job_id         = $delete_hash{$job_number};
            my $delete_command = "-job delete -id $job_id";
            my $delete_log     = "$delete_log_location/${job_number}_delete";

            execute_command( $xgrid, $delete_command, $stop_xgrid_crashes, 1, $delete_log );
            print "deleting job_number $job_number, job_id $job_id\n";

            delete $delete_hash{$job_number};
            $submitted_count--;    # frees a slot for the next submission
        }

        ##
        # Check if done
        ##
        # Done only when every entry of @jobs has been submitted (index past
        # the last element) and nothing is left in any of the three queues.
        if (    $total_jobs_submitted > $#jobs
            and !%status_hash
            and !%results_hash
            and !%delete_hash )
        {
            $not_finished = 0;
        }

        if ($not_finished) {
            print "Sleeping: ";
            for my $remaining ( reverse 0 .. $poll_sleep_value ) {
                print "$remaining ";
                sleep 1;
            }
        }
    }

    last RESTART;
}

exit 0;

# Run one utility action ("delete" or "status") against a single xgrid job id
# and, for "status", print the job state reported by the controller
# ("unknown" when the reply contains no jobStatus field).
sub run_utility_action {
    my ( $action, $job_id ) = @_;

    print "Job number $job_id: $action\n";
    my $results = execute_command( $xgrid, "-job $action -id $job_id", $stop_xgrid_crashes, 1, $utility_output );
    return unless $action eq "status";

    my $state = "unknown";
    if ( defined $results and $results =~ /\s*jobStatus\s*=\s*([A-Za-z]+)\s*;/ ) {
        $state = $1;
    }
    print "Job number $job_id: $action: $state\n";
    return;
}

# Return the lines printed by "/bin/ps -wwxo pid,command", or an empty list
# (with a warning) if ps could not be started.
sub list_processes {
    open my $ps, '-|', '/bin/ps', '-wwxo', 'pid,command'
      or do { warn "Could not run /bin/ps: $!\n"; return; };
    my @lines = <$ps>;
    close $ps;
    return @lines;
}

# Run "$command $params" and return what it printed.
#   $command         - program to run
#   $params          - its arguments, as one shell-quoted string
#   $catch_hangs     - true: run in the background with output redirected to
#                      $output, watch it with ps, and kill and re-run it if it
#                      is still running after $hang_wait_time seconds;
#                      false: run it directly and capture stdout
#   $require_results - true: keep waiting while the output file is empty
#   $output          - file that receives stdout and stderr ($catch_hangs only)
# Returns the output as one string (lines joined with a blank when read back
# from $output).  Returns "" if no output appeared within $hang_wait_time
# seconds after the process went away.  Dies if $output cannot be read.
sub execute_command {
    my ( $command, $params, $catch_hangs, $require_results, $output ) = @_;

    if ( !$catch_hangs ) {
        # print "$command $params\n";
        my $direct_result = `$command $params`;
        return $direct_result;
    }

    my $command_line = "$command $params";

    # ps shows the arguments without the shell quotes and separated by single
    # blanks, so match word by word, with every word taken literally.
    ( my $unquoted = $command_line ) =~ s/"//g;
    $unquoted =~ s/\s+\z//;
    my $ps_regex = join '\s+', map { quotemeta } split ' ', $unquoted;

    my $result = "";

  ATTEMPT:
    while (1) {
        # "> file 2>&1" is the POSIX spelling of the bash-only "&> file".
        my $shell_line = "$command_line > \"$output\" 2>&1 &";
        print "$shell_line\n";
        system $shell_line;

        my $timer = 1;

      CHECK:
        while (1) {
            sleep $sleep_time;

            my @running_still = grep { /$ps_regex\s*$/ } list_processes();

            if (@running_still) {
                if ( $timer > $hang_wait_time ) {
                    # Hung: kill it and run the command again.
                    my $pid = ( split ' ', $running_still[0] )[0];
                    print("killing $pid $unquoted\n");
                    system '/bin/kill', $pid;
                    next ATTEMPT;
                }

                # print ( "Waiting because still running: $unquoted\n" );
                $timer += $sleep_time;
                next CHECK;
            }

            if ( !-e $output ) {
                print "Output file does not exist yet\n";
            }
            else {
                open my $command_out, '<', $output
                  or die "Could not open $output: $!";
                my @output_lines = <$command_out>;
                close $command_out or die "Could not close $output: $!";

                $result = "@output_lines";

                # print "result $result\n";
                if ( !$require_results or $result ne "" ) {
                    last ATTEMPT;
                }

                # print "no results yet\n";
            }

            # The process is gone but there is nothing to read yet.  Do not
            # wait for ever: give up once the hang limit has passed.
            if ( $timer > $hang_wait_time ) {
                warn "No output from: $unquoted\n";
                last ATTEMPT;
            }
            $timer += $sleep_time;
        }
    }

    return $result;
}

# Called when the controller stops giving usable answers.
# Right now just exit (non-zero, because the run did not complete).  In the
# future, I should ping the controller and start over when it comes back up.
sub controller_crash {
    exit 1;
}