Source code for pysmac.utils.state_merge

import os
import glob
import operator
import errno
import filecmp
import shutil
import numpy


from .smac_output_readers import *



def find_largest_file(glob_pattern):
    """ Function to find the largest file matching a glob pattern.

    Old SMAC versions keep several versions of files as back-ups. This helper
    can be used to find the largest file, which should contain the final
    output. One could also go for the most recent file, but that might fail
    when the data is copied.

    :param glob_pattern: a UNIX style pattern to apply
    :type glob_pattern: string

    :returns: string -- largest file matching the pattern
    """
    fns = glob.glob(glob_pattern)

    if len(fns) == 0:
        raise RuntimeError("No file matching pattern '{}' found!".format(glob_pattern))

    # keep track of the largest file seen so far
    f_name = ""
    f_size = -1
    for fn in fns:
        s = os.lstat(fn).st_size
        if s > f_size:
            f_size = s
            f_name = fn
    return f_name
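A minimal usage sketch (the smac-output/state-run1 path is a hypothetical example, not something this module creates): pick the most complete back-up of the run history in a single SMAC state-run folder.

    import os
    from pysmac.utils.state_merge import find_largest_file

    # hypothetical location of one SMAC state-run folder
    pattern = os.path.join('smac-output', 'state-run1', 'runs_and_results-it*.csv')
    rar_file = find_largest_file(pattern)
    print(rar_file)  # largest, i.e. most complete, back-up matching the pattern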
def read_sate_run_folder(directory, rar_fn="runs_and_results-it*.csv",
                         inst_fn="instances.txt",
                         feat_fn="instance-features.txt",
                         ps_fn="paramstrings-it*.txt"):
    """ Helper function that reads all information from a state_run folder.

    To get all information of a SMAC run, several different files have to be
    read. This function provides a short notation for gathering all data at
    once.

    :param directory: the location of the state_run folder
    :type directory: str
    :param rar_fn: pattern to find the runs_and_results file
    :type rar_fn: str
    :param inst_fn: name of the instance file
    :type inst_fn: str
    :param feat_fn: name of the instance feature file. If this file is not
        found, pysmac assumes no instance features.
    :type feat_fn: str
    :param ps_fn: name of the paramstrings file
    :type ps_fn: str

    :returns: tuple -- (configurations returned by read_paramstrings_file,
        instance names returned by read_instances_file,
        instance features returned by read_instance_features_file,
        actual run data returned by read_runs_and_results_file)
    """
    print("reading {}".format(directory))

    configs = read_paramstrings_file(find_largest_file(os.path.join(directory, ps_fn)))
    instance_names = read_instances_file(find_largest_file(os.path.join(directory, inst_fn)))
    runs_and_results = read_runs_and_results_file(find_largest_file(os.path.join(directory, rar_fn)))

    # instance features are optional; without a feature file, None is returned
    full_feat_fn = glob.glob(os.path.join(directory, feat_fn))
    if len(full_feat_fn) == 1:
        instance_features = read_instance_features_file(full_feat_fn[0])
    else:
        instance_features = None

    return (configs, instance_names, instance_features, runs_and_results)
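A short sketch of how the returned tuple can be unpacked, again assuming a hypothetical state-run folder (the path below is illustrative only). Note that the third element is None when no instance-feature file is present.

    import os
    from pysmac.utils.state_merge import read_sate_run_folder

    folder = os.path.join('smac-output', 'state-run1')  # hypothetical folder
    configs, instance_names, instance_features, runs_and_results = \
        read_sate_run_folder(folder)
    print("{} configurations, {} instances, {} recorded runs".format(
        len(configs), len(instance_names), len(runs_and_results)))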
def state_merge(state_run_directory_list, destination,
                check_scenario_files=True, drop_duplicates=False,
                instance_subset=None):
    """ Function to merge multiple state_run directories into a single run
    to be used in, e.g., the fANOVA.

    To take advantage of the data gathered in multiple independent runs, the
    state_run folders have to be merged into a single directory that
    resembles the same structure. This allows easy application of pyfANOVA
    on all runs_and_results files.

    :param state_run_directory_list: list of state_run folders to be merged
    :type state_run_directory_list: list of str
    :param destination: a directory to store the merged data. The folder is
        created if needed, and already existing data in that location is
        silently overwritten.
    :type destination: str
    :param check_scenario_files: whether to ensure that all scenario files in
        all state_run folders are identical. This helps to avoid merging runs
        with different settings. Note: command-line options given to SMAC are
        not compared here!
    :type check_scenario_files: bool
    :param drop_duplicates: defines how to handle runs with identical
        configurations. For deterministic algorithms the function's response
        should be the same, so dropping duplicates is safe. Keep in mind that
        every duplicate effectively puts more weight on a configuration when
        estimating parameter importance.
    :type drop_duplicates: bool
    :param instance_subset: defines a list of instances that are used for the
        merge. All other instances are ignored.
        (Default: None, all instances are used)
    :type instance_subset: list
    """
    configurations = {}
    instances = {}
    runs_and_results = {}
    ff_header = set()

    i_confs = 1
    i_insts = 1

    # make sure all pcs files are the same
    pcs_files = [os.path.join(d, 'param.pcs') for d in state_run_directory_list]
    if not all([filecmp.cmp(fn, pcs_files[0]) for fn in pcs_files[1:]]):
        raise RuntimeError("The pcs files of the different runs are not identical!")

    # check the scenario files if desired
    scenario_files = [os.path.join(d, 'scenario.txt') for d in state_run_directory_list]
    if check_scenario_files and not all([filecmp.cmp(fn, scenario_files[0]) for fn in scenario_files[1:]]):
        raise RuntimeError("The scenario files of the different runs are not identical!")

    for directory in state_run_directory_list:
        try:
            confs, inst_names, tmp, rars = read_sate_run_folder(directory)
            (header_feats, inst_feats) = tmp if tmp is not None else (None, None)
        except Exception:
            print("Something went wrong while reading {}. Skipping it.".format(directory))
            continue

        # confs is a list of dicts, but dicts are not hashable, so they are
        # converted into a tuple of (key, value) pairs and then sorted
        confs = [tuple(sorted(d.items())) for d in confs]

        # merge the configurations
        for conf in confs:
            if conf not in configurations:
                configurations[conf] = {'index': i_confs}
                i_confs += 1

        # merge the instances
        ignored_instance_ids = []
        for i in range(len(inst_names)):
            if instance_subset is not None and inst_names[i][0] not in instance_subset:
                ignored_instance_ids.append(i)
                continue
            if inst_names[i][0] not in instances:
                instances[inst_names[i][0]] = {'index': i_insts}
                instances[inst_names[i][0]]['features'] = inst_feats[inst_names[i][0]] if inst_feats is not None else None
                instances[inst_names[i][0]]['additional info'] = ' '.join(inst_names[i][1:]) if len(inst_names[i]) > 1 else None
                i_insts += 1
            else:
                if inst_feats is None:
                    if instances[inst_names[i][0]]['features'] is not None:
                        raise ValueError("The data contains the same instance name ({}) twice, but once with and without features!".format(inst_names[i]))
                elif not numpy.all(instances[inst_names[i][0]]['features'] == inst_feats[inst_names[i][0]]):
                    raise ValueError("The data contains the same instance name ({}) twice, but with different features!".format(inst_names[i]))

        # store the feature file header
        if header_feats is not None:
            ff_header.add(",".join(header_feats))
            if len(ff_header) != 1:
                raise RuntimeError("Feature files are not consistent across runs!\n{}".format(header_feats))

        # make sure rars is always 2-dimensional, even for a single run
        if len(rars.shape) == 1:
            rars = numpy.array([rars])

        for run in rars:
            # get the local configuration and instance id
            lcid, liid = int(run[0]) - 1, int(run[1]) - 1
            if liid in ignored_instance_ids:
                continue
            # translate them into the global ones
            gcid = configurations[confs[lcid]]['index']
            giid = instances[inst_names[liid][0]]['index']

            # check for duplicates and skip if necessary
            if (gcid, giid) in runs_and_results:
                if drop_duplicates:
                    # print('dropped duplicate: configuration {} on instance {}'.format(gcid, giid))
                    continue
                else:
                    runs_and_results[(gcid, giid)].append(run[2:])
            else:
                runs_and_results[(gcid, giid)] = [run[2:]]

    # create output directory
    try:
        os.makedirs(destination)
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    # create all files, overwriting existing ones
    shutil.copy(pcs_files[0], destination)
    shutil.copy(scenario_files[0], destination)

    with open(os.path.join(destination, 'instances.txt'), 'w') as fh:
        sorted_instances = []
        for name in instances:
            if instances[name]['additional info'] is not None:
                sorted_instances.append((instances[name]['index'], name + ' ' + instances[name]['additional info']))
            else:
                sorted_instances.append((instances[name]['index'], name))
        sorted_instances.sort()
        fh.write('\n'.join(map(operator.itemgetter(1), sorted_instances)))
        fh.write('\n')

    with open(os.path.join(destination, 'runs_and_results-it0.csv'), 'w') as fh:
        cumulative_runtime = 0.0
        fh.write("Run Number,Run History Configuration ID,Instance ID,"
                 "Response Value (y),Censored?,Cutoff Time Used,"
                 "Seed,Runtime,Run Length,"
                 "Run Result Code,Run Quality,SMAC Iteration,"
                 "SMAC Cumulative Runtime,Run Result,"
                 "Additional Algorithm Run Data,Wall Clock Time,\n")
        run_i = 1
        for ((conf, inst), res) in list(runs_and_results.items()):
            for r in res:
                fh.write('{},{},{},'.format(run_i, conf, inst))
                fh.write('{},{},{},'.format(r[0], int(r[1]), r[2]))
                fh.write('{},{},{},'.format(int(r[3]), r[4], r[5]))
                fh.write('{},{},{},'.format(int(r[6]), r[7], 0))
                cumulative_runtime += r[4]
                # translate the numeric result code back into SMAC's string representation
                if r[10] == 2:
                    tmp = 'SAT'
                elif r[10] == 1:
                    tmp = 'UNSAT'
                elif r[10] == 0:
                    tmp = 'TIMEOUT'
                elif r[10] == -1:
                    tmp = 'CRASHED'
                fh.write('{},{},,{},'.format(cumulative_runtime, tmp, r[11]))
                fh.write('\n')
                run_i += 1

    with open(os.path.join(destination, 'paramstrings-it0.txt'), 'w') as fh:
        sorted_confs = [(configurations[k]['index'], k) for k in list(configurations.keys())]
        sorted_confs.sort()
        for conf in sorted_confs:
            fh.write("{}: ".format(conf[0]))
            fh.write(", ".join(["{}='{}'".format(p[0], p[1]) for p in conf[1]]))
            fh.write('\n')

    if header_feats is not None:
        with open(os.path.join(destination, 'instance-features.txt'), 'w') as fh:
            fh.write("instance," + ff_header.pop() + '\n')
            sorted_features = [(instances[inst]['index'], inst + ',' + ",".join(list(map(str, instances[inst]['features'])))) for inst in instances]
            sorted_features.sort()
            fh.write('\n'.join([t[1] for t in sorted_features]))

    return (configurations, instances, runs_and_results, sorted_instances, sorted_confs, inst_feats)
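A minimal merge sketch, assuming several hypothetical state-run folders written by independent SMAC runs (all paths below are illustrative, not produced by this module):

    import glob
    from pysmac.utils.state_merge import state_merge

    # hypothetical output directories of independent pysmac/SMAC runs
    state_runs = glob.glob('smac-output/run-*/state-run*')
    state_merge(state_runs, 'merged-state-run',
                drop_duplicates=True)  # safe for deterministic algorithms
    # 'merged-state-run' now resembles a single state-run folder and can be
    # handed to pyfANOVA / fANOVA tooling as one run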