#!/usr/bin/env perl #----------------------------------------------------------------------------------------------- # st_archive - script to handle short term archiving #----------------------------------------------------------------------------------------------- use strict; use Cwd; use English; use Getopt::Long; use IO::File; use IO::Handle; use XML::LibXML; use Data::Dumper; use POSIX qw(strftime); use File::Path; use File::Basename; use File::Copy; use File::Find; use File::stat; use File::Glob; # the st_archive is run in the CASEROOT my $cimeroot = `./xmlquery CIMEROOT -value`; my @dirs = ("$cimeroot/utils/perl5lib" ); unshift @INC, @dirs; require Config::SetupTools; require Log::Log4perl; require Text::Glob; require Number::Compare; require File::Find::Rule; # Globals my %opts = (loglevel=>"INFO"); my $compmap; my $level = Log::Log4perl::Level::to_priority($opts{loglevel}); Log::Log4perl->easy_init({level=>$level, layout=>'%m%n'}); my $logger = Log::Log4perl::get_logger(); #----------------------------------------------------------------------------------------------- # Setting autoflush (an IO::Handle method) on STDOUT helps in debugging. It forces the test # descriptions to be printed to STDOUT before the error messages start. #----------------------------------------------------------------------------------------------- *STDOUT->autoflush(); # the global config hash holds all valid XML variables necessary for the short term st_archive to function my %config = SetupTools::getAllResolved(); #----------------------------------------------------------------------------------------------- # print out the command line usage #----------------------------------------------------------------------------------------------- sub usage { my $usgstatement; $usgstatement = < XML tag definitions are used to define how the files in the \$RUNDIR are distributed into corresponding subdirectories in the \$DOUT_S_ROOT archive location and whether or not copies of files with specified suffices are preserved in the RUNDIR in addition to being copied to the DOUT_S_ROOT archive locations. There are multiple markup tags defined with nested elements included as part of a well formed XML markup tag defintion. When making changes to the env_archive.xml file in a \$CASEROOT, it is recommended that the XML be validated against the xml schema defintion document in the \$CIMEROOT/cime_config/xml_schemas/archive.xsd. For systems that have the xmllint utility installed, execute the following: > cd \$CASEROOT > xmllint -schema \$CIMEROOT/cime_config/xml_schemas/archive.xsd env_archive.xml If the env_archive.xml file contains valid XML code, then output from this command should be a complete listing of the env_archive.xml file followed by the line: env_archive.xml validates Any errors in the XML will be reported by the xmllint command and should be resolved before running the st_archive script in either the case run script or as a stand-alone script. The XML markup tag defintions in < >, data type definitions denoted in [ ], and "|" separated valid options in ( ) for the env_archive.xml file are defined as follows: <\?xml version="1.0"\?> Model component name (note: cam\\* and clm\\? are historical for CAM and CLM model components). [string] (rest|cpl|dart|atm|lnd|rof|ice|ocn|glc|wav) Component root directory name to be created under the $\DOUT_S_ROOT location. These may not correspond to the model component name. Perl regular expression specification for "globbing" filenames for matching. Run the command "\$CASEROOT/st_archive -input" command to see the list of valid suffices. If the suffix begins with a "." then all files matching *.(regex_suffix) are included in the globbing. If the suffix begins with a character other than "." then all files matching exactly the (regex_suffix) are included in the globbing. [string] (logs|rest|hist|init|diag) subdirectory name to be created under the \$DOUT_S_ROOT/rootdir location [string] (true|false) Option to specify whether or not to keep the most recent copy of the file with matching suffix in the RUNDIR in addition to copying it to the \$DOUT_S_ROOT/rootdir/subdir location. ADDITIONAL XML / ENVIRONMENT VARIABLES USED BY THE SHORT TERM ARCHIVER THAT ARE INCLUDED IN THE ENV_RUN.XML FILE The following xml variables read from the \$CASEROOT/env_run.xml file are used to control the behavior of the short term archiver. They can be queried using the xmlquery tool and modified using the xmlchange tool. DOUT_S - [boolean] Checked in \$CASE.run script. If TRUE (default), then st_archive script called at the end of a job. DOUT_S_ROOT - [string] Valid directory path to stage short term archive files. DOUT_S_SAVE_INTERIM_RESTART_FILES - [boolean] If TRUE (default FALSE), perform short term archiving on all interim restart files, not just those at the end of the run. By default, this value is FALSE. The restart files are saved under the specific component directory of \$DOUT_S_ROOT/[comp_archive_spec/rootdir]/rest in addition to the top-level \$DOUT_S_ROOT/rest/[dname] directory. Interim restart files are created using the \$REST_N and \$REST_OPTION variables. The associated rpointer files are not saved with the interim restart files and need to be manually generated in order for these restart files to be used to restart a run. This is for expert users ONLY and requires expert knowledge. We will not document this further in this guide. USAGE st_archive [options] OPTIONS -help|-h Print usage to STDOUT. -input|-in List out the contents of the env_archive.xml datafile in a friendly format and check the env_archive.xml file for validity. -output|-out List out the contents of the archive directory. -undo move all the files from the \$DOUT_S_ROOT back into the \$RUNDIR -loglevel set level of output from this script DEBUG, INFO, WARN, ERROR, FATAL (default INFO) EOF #" added for emacs perl mode parsing $logger->info($usgstatement); exit(1); } #----------------------------------------------------------------------------------------------- # Parse command-line options. #----------------------------------------------------------------------------------------------- sub getOptions { GetOptions( "h|help" => \$opts{'help'}, "in|input" => \$opts{'input'}, "out|output" => \$opts{'output'}, "undo" => \$opts{'undo'}, "loglevel" => \$opts{loglevel} ); usage() if $opts{'help'}; } #----------------------------------------------------------------------------------------------- # check the run environment and create the associated directories #----------------------------------------------------------------------------------------------- sub checkRun { my $statusFile = shift; my $runComplete = 0; $logger->debug("In checkRun..."); # # check if DOUT_S_ROOT is defined # if( !defined $config{'DOUT_S_ROOT'} || uc($config{'DOUT_S_ROOT'}) eq 'UNSET' ) { $logger->logdie("st_archive: Error - XML variable DOUT_S_ROOT is required for root location of short-term achiver. Exiting..."); } # # check if DOUT_S_ROOT dir needs to be created (or not) and if it can be read - creation permissions defaults to user's umask setting # ### my $config{'DOUT_S_ROOT'} = dirname( $config{'DOUT_S_ROOT'} ); mkpath( $config{'DOUT_S_ROOT'}, {verbose => 1, error => \my $err_list} ) unless ( -d $config{'DOUT_S_ROOT'} ); chdir( $config{'DOUT_S_ROOT'} ) or $logger->logdie ("st_archive: Error - cannot access directory $config{'DOUT_S_ROOT'} with error $err_list. Exiting..."); # # check if interim restart files should be saved - if not, print a warning # if( ( uc($config{'DOUT_S_SAVE_INTERIM_RESTART_FILES'}) eq 'FALSE' || uc($config{'DOUT_S_SAVE_INTERIM_RESTART_FILES'}) eq 'UNSET' ) && ($config{'REST_N'} < $config{'STOP_N'}) ) { $config{'DOUT_S_SAVE_INTERIM_RESTART_FILES'} = 'FALSE'; $logger->warn("st_archive: Warning - restart files from end of run will be saved, interim restart files will be deleted."); } # # check if the run completed successfully # if( -f $statusFile ) { open my $CASESTATUS, "<", "$statusFile" or die qq(st_archive: unable to open $statusFile. Exiting...\n); while( <$CASESTATUS> ) { chomp $_; if( /^run SUCCESSFUL/ ) { $runComplete = 1; } } close( $CASESTATUS ); } return $runComplete; } #----------------------------------------------------------------------------------------------- # read the archive XML file - env_archive.xml #----------------------------------------------------------------------------------------------- sub readXMLin { my $filename = shift; $logger->debug("In readXMLin..."); my $parser = XML::LibXML->new(); my $xml = $parser->parse_file($filename); return $xml; } #----------------------------------------------------------------------------------------------- # list archive XML file contents #----------------------------------------------------------------------------------------------- sub listXMLin { my ($xml) = shift; $logger->debug("In listXMLin..."); my $rootel = $xml->getDocumentElement(); my $schema = < [string:root directory name after DOUT_S_ROOT] (rest|cpl|dart|atm|lnd|rof|ice|ocn|glc|wav) [string: specify if the most recent copy of the matching file should be saved in the RUNDIR] (true|false) Run st_archive --help for complete tag schema descriptions. EOF # added for emacs perl mode parsing print $schema; my $elname = $rootel->getName(); $logger->info("Root element is a $elname and it contains ..."); # # list out the components nodes # my @comps = ($xml->findnodes('//comp_archive_spec')); foreach my $comp (@comps) { my $compname = $comp->getAttribute('name'); my $rootdir = $comp->findvalue('./rootdir'); my $ninst = getcompninst($compname); my $multi = "false"; if($ninst > 1){ $multi = "true"; } $logger->info("\n============================================================================"); $logger->info("Component name = $compname"); $logger->info("rootdir = $rootdir"); $logger->info("multiple-instance support = $multi"); my @files = $comp->findnodes('./files/file_extension'); foreach my $file (@files) { my $suffix = $file->getAttribute('regex_suffix'); my $subdir = $file->findvalue('./subdir'); my $keep_last_in_rundir = $file->findvalue('./keep_last_in_rundir'); $logger->info("\n***** File extension specification"); $logger->info(" regex_suffix = $suffix"); $logger->info(" subdir = $config{'DOUT_S_ROOT'}/$config{'CASENAME'}$rootdir/$subdir"); $logger->info(" keep_last_in_rundir = $keep_last_in_rundir"); } } } #----------------------------------------------------------------------------------------------- # get the date name from the most current coupler restart file #----------------------------------------------------------------------------------------------- sub getDname { $logger->debug("In getDname..."); my $DIR = $config{'RUNDIR'}; my $CASE = $config{'CASE'}; opendir( my $dh, $DIR ) or $logger->logdie("st_archive: Error - cannot open directory $DIR. Exiting..."); my @files = sort( grep(m/^${CASE}\.cpl\.r.(.*).nc$/, readdir $dh) ); my @names = split(/\./,$files[$#files]); my $dname = ''; if (@names) { $dname = $names[-2]; } else { $logger->logdie("st_archive: Error - cannot find a cpl.r.*.nc file in directory $DIR. Exiting..."); } $logger->debug("Cpl dName: $dname"); return $dname; } #----------------------------------------------------------------------------------------------- # recursive file tree traverse function to print out the archive #----------------------------------------------------------------------------------------------- sub listArchive { my ($thing) = @_; $logger->debug("In listArchive: $thing ..."); $logger->info("$thing "); return if not -d $thing; opendir my $dh, $thing or die; while (my $sub = readdir $dh) { next if $sub eq '.' or $sub eq '..'; listArchive( "$thing/$sub" ); } closedir $dh; return; } #----------------------------------------------------------------------------------------------- # get the NINST_[comp] from the config hash to determine if multiple instances are supported #----------------------------------------------------------------------------------------------- sub getcompninst { my ($comp) = @_; # the comp variable corresponds to the name attribute of the comp_archive_spec # in the env_archive.xml. Only need the first word of the comp and not the # regex punctuation $comp = (split('\[',$comp))[0]; # this loop is only executed the first time the subroutine is called # in order to load the compmap hash with all the actual model names # for a given component if(! defined $compmap){ foreach( qw(ATM CPL OCN WAV GLC ICE ROF LND )){ if($comp = $config{"COMP_".$_}){ $compmap->{$comp} = $_; } } } # with the compmap hash, look for the NINST value in the global config hash my $ninstname = "NINST_".$compmap->{$comp}; $logger->debug("Looking for $ninstname in config"); if(defined $config{$ninstname}){ return $config{$ninstname}; }else{ return 1; } } #----------------------------------------------------------------------------------------------- # undoArchive routine loops through the DOUT_S_ROOT getting the list of archived files # and then moves the file back into the RUNDIR. #----------------------------------------------------------------------------------------------- sub undoArchive { my @archivefiles; find( sub { return unless -f; push @archivefiles, $File::Find::name; }, $config{'DOUT_S_ROOT'}); my $numArchivefiles = @archivefiles; if( $numArchivefiles > 0 ) { foreach my $file (@archivefiles) { move( $file, $config{'RUNDIR'} ); } } } #----------------------------------------------------------------------------------------------- # moveFiles routine to move files into the DOUT_S_ROOT and check whether or not to preserve a # working copy in the rundir and the rest/dname dir #----------------------------------------------------------------------------------------------- sub moveFiles { my $keep_last_in_rundir = shift; my $suffix = shift; my $destdir = shift; my $restdir = shift; my $dname = shift; my $runfilesref = shift; my @runfiles = @{$runfilesref}; my %time = ''; my $restFile = ''; $logger->debug("In moveFiles..."); # # make sure operating from within the RUNDIR # chdir( $config{'RUNDIR'} ); # # filter the file list in the rundir to match the suffix value from the xml # my @files = sort( grep(/${suffix}/, @runfiles)); # # get the name of the most recent file in the $source list # sorting on the date string in the filename # my $numfiles = @files; if( $numfiles > 0 ) { # # get the most recent copy to keep in the RUNDIR # my $keepFile = $files[$#files]; if( $keep_last_in_rundir eq 'true' ) { # # copy the last file into the destination dir or the restdir # $logger->debug("keepFile = $keepFile"); copy( $keepFile, $destdir ); foreach my $file (@files) { if( $file ne $keepFile && -f $file ) { # # move all the remaining files except the last file into the destination dir # move( $file, $destdir ); } } } else { foreach my $file (@files) { move( $file, $destdir ); } } } # # splice the filename out of the runfiles array so it's not revisited # should only match on one filename since they must be unique filenames in the rundir # foreach my $file (@files) { if ( -f $file ) { my @index = grep { $runfiles[$_] eq $file } 0..$#runfiles; splice @runfiles, $index[0], 1; } } # # also get the file that matches the dname for this suffix and # copy to the restdir for a complete restart set # ## my @restFiles = grep(/${dname}/, @files); my $CASE = $config{'CASE'}; my @intFiles = grep(/${dname}/, @files); my @restFiles = grep(/${CASE}/, @intFiles); if( $#restFiles == 0 ) { my @path = split /\//, $restFiles[$#restFiles]; if ( $#path >= 0 ) { $restFile = $path[$#path]; if ( -f $destdir . "/" . $restFile ) { copy( $destdir . "/" . $restFile, $restdir . "/" . $restFile ); } } } elsif( $#restFiles > 0 ) { $logger->info("st_archive: multiple restart files found for suffix=$suffix"); } return $numfiles, \@runfiles; } #----------------------------------------------------------------------------------------------- # short term archive routine #----------------------------------------------------------------------------------------------- sub archiveProcess { my ($XMLin) = shift; my $dname = shift; my $runfilesref = shift; my @runfiles = @{$runfilesref}; $logger->debug("In archiveProcess..."); my %fExt = (); my %comp = (); my ($source, $destdir, $linkdir, $outfile) = ''; # # create the restdir for a complete restart set with the coupler extension # my $restdir = qq($config{'DOUT_S_ROOT'}/rest/$dname); mkpath( $restdir, {verbose => 1, error => \my $err_list} ) unless ( -d $restdir ); chdir ($restdir ) or $logger->logdie("st_archive: Error - cannot access directory $restdir with error $err_list. Exiting..."); # # copy rpointer files for all components to the $restdir # chdir ($config{'RUNDIR'}); my @rpointers = <$config{'RUNDIR'}/rpointer.*>; foreach my $rpointer (@rpointers) { copy( $rpointer, $restdir ) or $logger->logdie("st_archive: Error - cannot copy $rpointer file to directory $restdir. Exiting..."); } # # main loop through the comp_archive_spec XML elements handling the associated files accordingly # my @xmlcomps = ($XMLin->findnodes('//comp_archive_spec')); foreach my $xmlcomp (@xmlcomps) { my $str = $xmlcomp->getAttribute('name'); $str =~ s/^\s+//; $str =~ s/\s+$//; my $compname = $str; my $rootdir = $xmlcomp->findvalue('./rootdir'); # # get the number of instances running for each component that supports multiple instances # my $ninst = getcompninst($compname); # # loop through the file extensions for this component to copy over files with those matching extensions to the ARCHIVE_DIR locations # my @infiles = ($xmlcomp->findnodes('./files/file_extension')); foreach my $infile (@infiles) { my $suffix = $infile->getAttribute('regex_suffix'); my $subdir = $infile->findvalue('./subdir'); my $keep_last_in_rundir = $infile->findvalue('./keep_last_in_rundir'); # # setup the subdir for this component # $destdir = qq($config{'DOUT_S_ROOT'}/$rootdir/$subdir); mkpath( $destdir, {verbose => 1, error => \my $err_list} ) unless ( -d $destdir ); chdir( $destdir ) or $logger->logdie("st_archive: Error - cannot access directory $destdir with error $err_list. Exiting..."); # # main loop to move files # for( my $i = 1; $i <= $ninst; $i++ ) { my $ninst_suffix = ''; if( $ninst > 1 ) { $ninst_suffix = sprintf( '%04d', $i ); $ninst_suffix = qq(_$ninst_suffix); } # # build up the new suffix based on whether the XML regex_suffix starts with a "." # my $newSuffix = qq($suffix); if( substr($suffix, 0, 1) eq '.' ) { if( $subdir eq 'logs' ) { # # component logs are handled a little differently # $newSuffix = qq($rootdir$ninst_suffix$suffix); } else { $newSuffix = qq($config{'CASE'}.$compname$ninst_suffix$suffix); } } # # move the Files to the DOUT_S_ROOT correct sub-directory # my ($numfiles, $runfilesref) = moveFiles( lc($keep_last_in_rundir), $newSuffix, $destdir, $restdir, $dname, \@runfiles ); @runfiles = @{$runfilesref}; } } } } #---------------------------------------------------------------------------------------------- # remove restart duplicate files from the comp/rest and rest/dname directories #----------------------------------------------------------------------------------------------- sub removeDups { my $dname = shift; my $restdir = qq($config{'DOUT_S_ROOT'}/rest/$dname); chdir( $restdir ) or $logger->logdie("st_archive: Error - cannot access directory $restdir in removeDups. Exiting..."); my @restfiles; find( sub { return unless -f; push @restfiles, $File::Find::name; }, $restdir); foreach my $restfile (@restfiles) { my @path = split /\//, $restfile; my $file = @path[-1]; my @matchfiles = File::Find::Rule->file->name($file)->in($config{'DOUT_S_ROOT'}); foreach my $matchfile (@matchfiles) { if( $matchfile =~ /\/rest\// && $matchfile ne $restfile && $matchfile !~ /\/hist\// && $matchfile !~ /rpointer/) { $logger->debug("Dedebugng matchfile=$matchfile matches restfile=$restfile"); unlink( $matchfile ); } } } } #----------------------------------------------------------------------------------------------- # Main program #----------------------------------------------------------------------------------------------- sub main { my $dname; getOptions(); if( defined $opts{'help'} ) { usage(); exit 0; } my $runComplete = checkRun( "$config{'CASEROOT'}/CaseStatus" ); my $XMLin = readXMLin( "$config{'CASEROOT'}/env_archive.xml" ); if( defined $opts{'input'} ) { # list the input env_archive.xml values listXMLin( $XMLin ); } elsif( defined $opts{'output'} && $runComplete ) { # list the directories in the $config{'DOUT_S_ROOT'} directory $dname = getDname(); $logger->info("Short-term archive listing of $config{'DOUT_S_ROOT'}:"); listArchive( $config{'DOUT_S_ROOT'} ); } elsif ( defined $opts{'undo'} ) { # # unlink the archive process and move all files in the archive back to rundir # undoArchive(); } elsif ( $runComplete ) { # load the runfiles global array with all the files in the RUNDIR my @runfiles; find( sub { return unless -f; push @runfiles, $File::Find::name; }, $config{'RUNDIR'}); # get the date name from the most current coupler log file $dname = getDname(); # run the archive process archiveProcess( $XMLin, $dname, \@runfiles ); if( uc($config{'DOUT_S_SAVE_INTERIM_RESTART_FILES'}) eq 'TRUE' ) { # remove restart duplicate files from the comp/rest and rest/dname directories removeDups( $dname ); } $logger->info("st_archive process complete."); exit 0; } else { $logger->info("st_archive: run is not yet complete or was not successful. Unable to perform option..."); exit 1; } } main(@ARGV) unless caller;