Core functionality

Basic processing

Logging


source

init_console_logging

 init_console_logging (name=None, level=20, timestamp=True)

Set up a non-blocking stream handler for sending logging output to the console.

Exported source
def init_console_logging(name=None, level=logging.INFO, timestamp=True):
    '''Set up a stream handler on the root logger for sending logging to the console.

    The handler is installed only once: when either the logger called
    ``name`` or the root logger already has a handler, nothing is changed.

    Args:
        name: Name of the logger to check for existing handlers; ``None``
            refers to the root logger.
        level: Level applied to both the root logger and the new handler.
        timestamp: When true, every line starts with an ISO-8601 style
            timestamp.

    Returns:
        The root logger when a new console handler was installed, otherwise
        the (already configured) logger called ``name``.
    '''

    root_logger = logging.getLogger()
    named_logger = logging.getLogger(name)

    # The handler is attached to the root logger, so the guard has to look at
    # the root logger too. Checking only the named logger added one more
    # console handler (and one more copy of every log line) on each call made
    # with a name.
    if named_logger.handlers or root_logger.handlers:
        named_logger.info('There already is a logger installed for %s.', name)
        return named_logger

    root_logger.setLevel(level)

    console = logging.StreamHandler()
    console.setLevel(level)

    # Tab-separated format which is simple to read and to parse on a console.
    if timestamp:
        formatter = logging.Formatter("%(asctime)s %(levelname)s\t%(process)d\t%(name)s\t%(filename)s\t%(lineno)d\t%(message)s", datefmt='%Y-%m-%dT%H:%M:%S%z')
    else:
        formatter = logging.Formatter("%(levelname)s\t%(process)d\t%(name)s\t%(filename)s\t%(lineno)d\t%(message)s")

    console.setFormatter(formatter)

    # Records from every logger propagate to the root logger, so one handler
    # there serves them all.
    root_logger.addHandler(console)
    return root_logger

Machine information

get_machine_info()
{'OS Name': 'nt',
 'System Name': 'Windows',
 'Release': '11',
 'Version': '10.0.26100',
 'Architecture': ('64bit', 'WindowsPE'),
 'Machine': 'AMD64',
 'Processor': 'Intel64 Family 6 Model 85 Stepping 4, GenuineIntel',
 'Node Name': 'werkdoos'}

Strings


source

snake_case_to_camel_case

 snake_case_to_camel_case (snake_case:str)
Exported source
@lru_cache(128)
def snake_case_to_camel_case(snake_case:str) -> str:
    """Convert a snake_case name to camelCase.

    The text before the first underscore is kept as it is; every following
    part is passed through ``str.capitalize`` (first character upper case,
    the rest lower case) and the parts are joined without separators.
    Results are memoised for the 128 most recently used inputs.
    """
    # str.split always yields at least one element, so the unpacking is safe.
    head, *tail = snake_case.split('_')
    return head + ''.join(part.capitalize() for part in tail)

Numpy data in JSON


source

NumpyEncoder

 NumpyEncoder (skipkeys=False, ensure_ascii=True, check_circular=True,
               allow_nan=True, sort_keys=False, indent=None,
               separators=None, default=None)

Custom encoder for numpy data types

Timeseries dataframes

Timeseries data is a cornerstone of our data manipulation, and most of the processing operates on it.

set_time_index_zone

Processing may depend on proper timezone awareness; this utility sets the timezone on a datetime index.


source

set_time_index_zone

 set_time_index_zone (df:pandas.core.frame.DataFrame, timezone)

*Sets the time zone of the index of a pandas DataFrame.

Args: df (pd.DataFrame): The DataFrame whose index time zone is to be set. timezone (str): The desired time zone.

Returns: pd.DataFrame: The modified DataFrame with its index time zone set to the specified time zone.

Raises: None

Examples: >>> df = pd.DataFrame({'A': [1, 2, 3]}, index=pd.DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03'])) >>> set_time_index_zone(df, 'Europe/Berlin') A 2022-01-01 1 2022-01-02 2 2022-01-03 3 DatetimeIndex: 3 entries, 2022-01-01 01:00:00+01:00 to 2022-01-03 01:00:00+01:00*

Type Details
df DataFrame Dataframe to set or convert the timeindex on
timezone Timezone to set

Example

df = pd.DataFrame({'A': [1, 2, 3]}, index=pd.DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03']))
set_time_index_zone(df, 'Europe/Berlin')
df.index
DatetimeIndex(['2022-01-01 01:00:00+01:00', '2022-01-02 01:00:00+01:00',
               '2022-01-03 01:00:00+01:00'],
              dtype='datetime64[ns, Europe/Berlin]', name='time', freq=None)

timeseries_dataframe

Converts Pandas dataframes and series, NumPy arrays and recarrays, or a dictionary of individual timeseries into a Pandas dataframe with one datetime index. For all arrays, dataframes and series, it is assumed that the first column contains the timestamps.


source

timeseries_dataframe

 timeseries_dataframe (data:Union[pandas.core.frame.DataFrame,pandas.core.
                       series.Series,dict,numpy.ndarray,numpy.rec.recarray
                       ], timezone='UTC', columnnames=None)

*Convert various tabular data formats to timeseries DataFrame

Args: data (Union[pd.DataFrame, pd.Series, dict, np.ndarray, np.recarray]): The input data to be converted. timezone (str, optional): The timezone to set for the index of the DataFrame. Defaults to ‘UTC’. columnnames (Optional[List[str]]): The column names to use for the DataFrame. Defaults to None.

Returns: pd.DataFrame: The converted timeseries DataFrame with the index set to the specified timezone.*

timeseries_dataframe_from_datadict


source

timeseries_dataframe_from_datadict

 timeseries_dataframe_from_datadict (data:dict, timecolumns=None,
                                     recordformat='records')

Converts a data dict into a pandas DataFrame based on the specified record format. Parameters: - data: A dictionary containing the data to convert. - timecolumns: A list of column names to be treated as time columns. - recordformat: A string specifying the format of the data records (‘records’, ‘table’, ‘split’, ‘index’, ‘tight’). Returns: - df: A pandas DataFrame with a DatetimeIndex representing the converted data.

Exported source
def timeseries_dataframe_from_datadict(
        data:dict, 
        timecolumns=None,
        recordformat='records'):
        
    """
    Converts a data dict into a pandas DataFrame based on the specified record format. 

    Where the timestamps are taken from depends on the format:
        - 'records': the first column whose name occurs in `timecolumns`; when
          none matches, the first column of the data.
        - 'table': the first entry of data['schema']['primaryKey'], looked up
          as a column of data['data'].
        - 'split': data['index'], combined with data['columns'] and data['data'].
        - 'index', 'tight': the index produced by pandas.DataFrame.from_dict.

    Timestamps are parsed as ISO 8601, converted to UTC and rounded to
    milliseconds.

    Parameters:
        - data: A dictionary (or, for 'records', a sequence of records) containing the data to convert.
        - timecolumns: A list of column names to be treated as time columns. Must not be empty.
        - recordformat: A string specifying the format of the data records ('records', 'table', 'split', 'index', 'tight').
    Returns:
        - df: A pandas DataFrame with a DatetimeIndex named 'time' representing the converted data.
    Raises:
        - AssertionError: for an unknown record format or when no time columns are given.
    """

    orient = recordformat.lower()
    assert orient in ['records', 'table', 'split', 'index', 'tight'], f'Unsupported record format {recordformat}'
    assert timecolumns, 'No time columns specified'

    # Name of the column holding the timestamps; None means that the
    # timestamps are found in the index of the frame instead.
    time_column = None

    if orient == 'records':
        # data is a structured ndarray, sequence of tuples or dicts, or DataFrame
        df = pd.DataFrame.from_records(data)
        time_columns_in_df = [C for C in df.columns if C in timecolumns]
        if time_columns_in_df:
            time_column = time_columns_in_df[0]
        else:
            # no column matches the specification, assume the first column is time
            time_column = df.columns[0]

    elif orient == 'table':
        # data is in pandas table format. The primary key stays a regular
        # column here; setting it as index at this point made the column
        # lookup in the conversion step below fail with a KeyError.
        time_column = data['schema']['primaryKey'][0]
        df = pd.DataFrame.from_dict(data['data'])

    elif orient == 'split':
        # DataFrame.from_dict does not accept orient='split', so the frame is
        # assembled from its three parts directly.
        df = pd.DataFrame(data['data'], index=data['index'], columns=data['columns'])

    else:
        # 'index' or 'tight': data is formatted according to 'orient' parameter (pandas)
        df = pd.DataFrame.from_dict(data, orient=orient) # type: ignore

    df.columns = list(df.columns)
    if time_column is not None:
        df[time_column] = pd.to_datetime(df[time_column],utc=True,format='ISO8601')
        df = df.set_index(time_column)
    else:
        df.index = pd.to_datetime(df.index,utc=True,format='ISO8601')
    df.index = pd.DatetimeIndex(df.index).round('ms')
    
    df.index.name = 'time'

    return df
df = timeseries_dataframe_from_datadict([
      {
         "time":"2023-05-04T10:04:49.000Z",
         "value":16.72
      },
      {
         "time":"2023-05-04T10:24:51.000Z",
         "value":16.65
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":16.55
      }
   ], timecolumns=['time'])
df
value
time
2023-05-04 10:04:49+00:00 16.72
2023-05-04 10:24:51+00:00 16.65
2023-05-04 10:44:53+00:00 16.55
df.index
DatetimeIndex(['2023-05-04 10:04:49+00:00', '2023-05-04 10:24:51+00:00',
               '2023-05-04 10:44:53+00:00'],
              dtype='datetime64[ns, UTC]', name='time', freq=None)
df = set_time_index_zone( timeseries_dataframe_from_datadict([
      {
         "time":"2023-05-04T10:04:49",
         "value":16.72
      },
      {
         "time":"2023-05-04T10:24:51",
         "value":16.65
      },
      {
         "time":"2023-05-04T10:44:53",
         "value":16.55
      }
   ], timecolumns=['time']), timezone='Europe/Amsterdam')
df
value
time
2023-05-04 12:04:49+02:00 16.72
2023-05-04 12:24:51+02:00 16.65
2023-05-04 12:44:53+02:00 16.55
df.index
DatetimeIndex(['2023-05-04 12:04:49+02:00', '2023-05-04 12:24:51+02:00',
               '2023-05-04 12:44:53+02:00'],
              dtype='datetime64[ns, Europe/Amsterdam]', name='time', freq=None)