Source code for prms_python.data

# -*- coding: utf-8 -*-
'''
data.py -- holds ``Data`` class for standard PRMS climate input data.
'''

import pandas as pd
from shutil import copyfile

class Data(object):
    """
    Object to access or create a PRMS data file with ability to load/assign it
    to a date-time indexed pandas.DataFrame for data management, analysis and
    visualization. It can be used to build a new PRMS data file from user
    defined metadata and a ``pandas.DataFrame`` of PRMS datetime-indexed
    climatic forcing and observation variables.

    The class properties ``metadata`` and ``data_frame`` can be later assigned
    if no ``base_file`` is given on initialization, allowing for the creation
    of a PRMS climatic forcing file in a Python environment.

    Keyword Arguments:
        base_file (str, optional): path to standard PRMS data file
        na_rep (int, optional): how to represent missing values default = -999

    Attributes:
        date_header (list): date and time header for PRMS data file
        valid_input_variables (tuple): valid hydro-climate variables for PRMS
            data file

    Note:
        If using the ``Data`` class to create a new data file, it is up to the
        user to ensure that the metadata and :class:`pandas.DataFrame` assigned
        are correct and compatible.

    """

    ## data file constant attributes
    date_header = ['year', 'month', 'day', 'hh', 'mm', 'sec']

    valid_input_variables = (
        'gate_ht',
        'humidity',
        'lake_elev',
        'pan_evap',
        'precip',
        'rain_day',
        'runoff',
        'snowdepth',
        'solrad',
        'tmax',
        'tmin',
        'wind_speed',
    )

    def __init__(self, base_file=None, na_rep=-999):
        self.base_file = base_file
        self.na_rep = na_rep
        # both are filled lazily by the properties below (or assigned by user)
        self._metadata = None
        self._data_frame = None

    @staticmethod
    def _expand_variable(line, idx):
        """
        Turn one variable declaration line (``"<name> <n_columns>"``) into the
        list of column names it contributes to the data section.

        Arguments:
            line (str): header line that starts with a valid variable name.
            idx (int): 0-indexed line number, only used for error messages.

        Returns:
            list: ``[name]`` when one column is declared, ``["name 1", ...,
            "name n"]`` when several are, and ``[]`` for a count below one.

        Raises:
            ValueError: if the column count is missing or not an integer.
        """
        fields = line.split()
        try:
            n_cols = int(fields[1])
        except (IndexError, ValueError):
            raise ValueError(
                'Line {} of the data file declares variable "{}" without a '
                'valid integer column count'.format(idx + 1, fields[0])
            )
        if n_cols > 1:
            return ['{var_name} {var_ind}'.format(var_name=fields[0],
                                                  var_ind=i + 1)
                    for i in range(n_cols)]
        if n_cols == 1:
            return [fields[0]]
        return []

    @property
    def metadata(self):
        """
        :obj:`dict`: A property that gets and sets the header information from
        a standard PRMS climate input data file held in a Python dictionary.
        As a property it can be assigned directly to overwrite or create a new
        PRMS data file. As such the user is in control and must supply the
        correct syntax for PRMS standard data files, e.g. text lines before
        header should begin with "//". Here is an example of the information
        gathered and held in this attribute:

        Example:
            >>> data.metadata
                {
                 'data_startline' : 6,
                 'data_variables' : ['runoff 1', 'runoff 2', 'tmin', 'tmax', 'ppt']
                 'text_before_header' : "Title of data file \\n //some comments\\nrunoff 2
                                         \\ntmin 1\\ntmax 1\\nppt 1\\nrunoff 2\\ntmin 1
                                         \\ntmax 1\\nppt 1\\n
                                         ########################################\\n"
                }

        Note:
            ``text_before_header`` holds every line of the file up to and
            including the line of number signs "#"; ``Data.write`` emits it
            verbatim followed by the date columns and the data.

        Raises:
            ValueError: if metadata is accessed before it is assigned on a
                ``Data`` instance that was initialized without a PRMS data
                file, if the file has no "#" line separating header from
                data, or if a variable declaration has no integer count.
            TypeError: if an object that is not a Python dictionary is
                assigned.

        """
        # to avoid overwriting pre-assigned data, check if already exists
        if isinstance(self._metadata, dict):
            return self._metadata
        if not self.base_file:
            raise ValueError('No data file was given on initialization')

        input_data_names = []
        header_lines = []
        data_startline = None
        with open(self.base_file, 'r') as inf:
            for idx, line in enumerate(inf):
                header_lines.append(line)
                # first line is always the free-text title of the file and
                # "//" lines are comments: keep them in the header text but
                # never parse them as variable declarations
                if idx == 0 or line.startswith('/'):
                    continue
                if line.startswith('#'):
                    # end of header; 0-indexed line of the first data entry
                    data_startline = idx + 1
                    break
                if line.startswith(Data.valid_input_variables):
                    input_data_names.extend(self._expand_variable(line, idx))

        if data_startline is None:
            raise ValueError(
                'No line starting with "#" was found in {}; cannot locate '
                'the start of the data section'.format(self.base_file)
            )

        self._metadata = {
            'data_startline': data_startline,
            'data_variables': input_data_names,
            'text_before_header': ''.join(header_lines),
        }
        return self._metadata

    @metadata.setter
    def metadata(self, dic):
        if not isinstance(dic, dict):
            raise TypeError('Must assign a Python dictionary for new Data object/file metadata')
        self._metadata = dic

    @property
    def data_frame(self):
        """
        A property that gets and sets the climatic forcing data for a standard
        PRMS climate input data file as a :class:`pandas.DataFrame`.

        Example:
            d is a Data instance, calling

            >>> d.data_frame
                input variables  runoff 1  runoff 2  runoff 3  precip  tmax  tmin
                date
                1996-12-27           0.54       1.6       NaN     0.0    46  32.0
                1996-12-28           0.65       1.6       NaN     0.0    45  24.0
                1996-12-29           0.80       1.6       NaN     0.0    44  28.0
                1996-12-30           0.90       1.6       NaN     0.0    51  33.0
                1996-12-31           1.00       1.7       NaN     0.0    47  32.0

            shows the date-indexed ``pd.DataFrame`` of the input data that is
            read from ``base_file``, i.e. the file path to a PRMS climate data
            file, the first time the property is accessed. Only the year,
            month and day columns are used to build the index.

        Raises:
            ValueError: if attribute is accessed before either assigning a
                PRMS data file on ``Data`` initialization or assigning a
                compatible date-indexed ``pandas.DataFrame`` of hydro-climate
                variables.
            TypeError: if a data type other than ``pandas.DataFrame`` is
                assigned.
        """
        # a frame that was already loaded or assigned always wins, regardless
        # of whether metadata has been read/assigned yet
        if isinstance(self._data_frame, pd.DataFrame):
            return self._data_frame
        if not self.base_file:
            raise ValueError('No data base_file given on initialization, '
                             'therefore you must assign a DataFrame'
                             ' before accessing the .data_frame attribute!')

        meta = self.metadata
        # header=None: the data section has no column-name row;
        # sep=r'\s+' splits on any run of whitespace
        df = pd.read_csv(self.base_file, header=None,
                         skiprows=meta['data_startline'],
                         sep=r'\s+', na_values=[self.na_rep])
        df.columns = Data.date_header + list(meta['data_variables'])
        dates = pd.to_datetime(df[['year', 'month', 'day']])
        df.index = pd.DatetimeIndex(dates)  # assign datetime index
        df = df.drop(Data.date_header, axis=1)  # unneeded columns
        df.columns.name = 'input variables'
        df.index.name = 'date'
        self._data_frame = df
        return self._data_frame

    @data_frame.setter
    def data_frame(self, df):
        if not isinstance(df, pd.DataFrame):
            raise TypeError("Must assign a Pandas.DataFrame object for PRMS data input")
        self._data_frame = df

    def modify(self, func, vars_to_adjust):
        """
        Apply a user defined function to one or more variable(s) in the data
        file.

        The ``modify`` method allows for inplace modification of one or more
        time series inputs in the data file based on a user defined function.

        Arguments:
            func (function): function to apply to each element of every
                variable in vars_to_adjust
            vars_to_adjust (list or tuple): collection of variable names to
                apply func to. A single name given as a string is also
                accepted.

        Returns:
            None

        Example:
            Here is an example of loading a data file, modifying the
            temperature inputs (*tmin* and *tmax*) by adding two degrees to
            each element, and rewriting the modified data to disk,

            >>> d = Data('path_to_data_file')
            >>> def f(x):
                    return x + 2
            >>> d.modify(f,['tmax','tmin'])
            >>> d.write('data_temp_plus_2')
        """
        # a bare string would otherwise be iterated character by character
        if isinstance(vars_to_adjust, str):
            vars_to_adjust = [vars_to_adjust]

        frame = self.data_frame  # loads from file, or raises ValueError
        for var in vars_to_adjust:
            frame[var] = frame[var].apply(func)

    def write(self, out_path):
        """
        Writes the current state of the ``Data`` to PRMS text format
        utilizing the ``Data.metadata`` and ``Data.data_frame`` instance
        properties. If ``Data.data_frame`` was never accessed or assigned
        new values then this method simply copies the original PRMS
        data file to ``out_path``.

        Arguments:
            out_path (str): full path to save or copy the current PRMS data
                in PRMS text format.

        Returns:
            None

        Raises:
            ValueError: if the ``write`` method is called without assigning
                either an initial data (``base_file``) path or assigning
                correct ``metadata`` and ``data_frame`` properties.

        """
        # if file data was never accessed it is unchanged: plain copy
        if not isinstance(self._data_frame, pd.DataFrame):
            if not self.base_file:
                # data not from original file and dataframe never assigned
                raise ValueError('No data base_file was given and'
                                 ' no data was assigned!')
            copyfile(self.base_file, out_path)
            return

        ## reconstruct PRMS data file format
        meta = self.metadata
        variables = list(meta['data_variables'])
        # explicit copy so the date columns are added to a scratch frame and
        # never to (a view of) the user's date-indexed frame
        df = self._data_frame[variables].copy()
        df['year'] = self._data_frame.index.year
        df['month'] = self._data_frame.index.month
        df['day'] = self._data_frame.index.day
        df['hh'] = 0
        df['mm'] = 0
        df['sec'] = 0
        df = df[Data.date_header + variables]
        with open(out_path, 'w') as outf:  # write comment header then data
            outf.write(meta['text_before_header'])
            df.to_csv(outf, sep=' ', header=False,
                      index=False, na_rep=str(self.na_rep))
Navigation

Related Topics

Donate/support

Source code for prms_python.data