Core functionality

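Core utilities: console logging setup, helpers for converting between timeseries data and pandas DataFrames, and the `AICoreModuleBase` class.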

source

init_console_logging

 init_console_logging (name=None, level=20, timestamp=True)

Set up a non-blocking stream handler for sending logging output to the console.

Exported source
def init_console_logging(name=None, level=logging.INFO, timestamp=True):
    '''Set up a stream handler that sends log records to the console.

    The handler is installed on the root logger, so records from every
    logger that propagates to the root end up on the console.

    Args:
        name: Name of the logger to check for existing handlers. ``None``
            refers to the root logger.
        level: Level applied to both the root logger and the new handler.
        timestamp: When true, every line starts with a timestamp formatted
            as ``%Y-%m-%dT%H:%M:%S%z``.

    Returns:
        The root logger when a handler was installed; otherwise the logger
        identified by ``name``, which is left untouched.
    '''

    requested = logging.getLogger(name)
    root = logging.getLogger()

    # The handler always lands on the root logger, so the root has to be
    # checked too: looking only at the named logger would stack a duplicate
    # console handler on every call made with a not-yet-configured name.
    if requested.handlers or root.handlers:
        requested.info('There already is a logger installed for %s.', name)
        # Hand back a usable logger instead of None so that
        # ``log = init_console_logging(__name__)`` is always safe to use.
        return requested

    root.setLevel(level)

    console = logging.StreamHandler()
    console.setLevel(level)

    # Tab-separated layout that is easy to read on a console; the timestamp
    # is an optional prefix.
    layout = "%(levelname)s\t%(process)d\t%(name)s\t%(filename)s\t%(lineno)d\t%(message)s"
    if timestamp:
        formatter = logging.Formatter("%(asctime)s " + layout, datefmt='%Y-%m-%dT%H:%M:%S%z')
    else:
        formatter = logging.Formatter(layout)

    console.setFormatter(formatter)
    root.addHandler(console)
    return root

Timeseries dataframes

Timeseries data is a cornerstone of our data manipulation, and most processing operates on it.

set_time_index_zone

Processing may depend on proper timezone awareness; this utility sets the timezone on a datetime index.


source

set_time_index_zone

 set_time_index_zone (df:pandas.core.frame.DataFrame, timezone)

*Sets the time zone of the index of a pandas DataFrame.

Args: df (pd.DataFrame): The DataFrame whose index time zone is to be set. timezone (str): The desired time zone.

Returns: pd.DataFrame: The modified DataFrame with its index time zone set to the specified time zone.

Raises: None

Examples: >>> df = pd.DataFrame({'A': [1, 2, 3]}, index=pd.DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03'])) >>> set_time_index_zone(df, 'Europe/Berlin') A 2022-01-01 1 2022-01-02 2 2022-01-03 3 DatetimeIndex: 3 entries, 2022-01-01 01:00:00+01:00 to 2022-01-03 01:00:00+01:00*

Type Details
df DataFrame Dataframe to set or convert the timeindex on
timezone Timezone to set

Example

df = pd.DataFrame({'A': [1, 2, 3]}, index=pd.DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03']))
set_time_index_zone(df, 'Europe/Berlin')
df.index
DatetimeIndex(['2022-01-01 01:00:00+01:00', '2022-01-02 01:00:00+01:00',
               '2022-01-03 01:00:00+01:00'],
              dtype='datetime64[ns, Europe/Berlin]', name='time', freq=None)

timeseries_dataframe

Converts pandas dataframes and series, NumPy arrays and recarrays, or a dictionary of individual timeseries into a pandas dataframe with one datetime index. For all arrays, dataframes and series, it is assumed that the first column contains the timestamps.


source

timeseries_dataframe

 timeseries_dataframe (data:Union[pandas.core.frame.DataFrame,pandas.core.
                       series.Series,dict,numpy.ndarray,numpy.rec.recarray
                       ], timezone='UTC', columnnames=None)

*Convert various tabular data formats to timeseries DataFrame

Args: data (Union[pd.DataFrame, pd.Series, dict, np.ndarray, np.recarray]): The input data to be converted. timezone (str, optional): The timezone to set for the index of the DataFrame. Defaults to ‘UTC’. columnnames (Optional[List[str]]): The column names to use for the DataFrame. Defaults to None.

Returns: pd.DataFrame: The converted timeseries DataFrame with the index set to the specified timezone.*

timeseries_dataframe_from_datadict


source

timeseries_dataframe_from_datadict

 timeseries_dataframe_from_datadict (data:dict, timecolumns=None,
                                     recordformat='records')

Converts a data dict into a pandas DataFrame based on the specified record format. Parameters: - data: A dictionary containing the data to convert. - timecolumns: A list of column names to be treated as time columns. - recordformat: A string specifying the format of the data records (‘records’, ‘table’, ‘split’, ‘index’, ‘tight’). Returns: - df: A pandas DataFrame with a DatetimeIndex representing the converted data.

Exported source
def timeseries_dataframe_from_datadict(
        data:dict,
        timecolumns=None,
        recordformat='records'):

    """
    Converts a data dict into a pandas DataFrame based on the specified record format.
    Parameters:
        - data: The data to convert, laid out according to `recordformat`.
        - timecolumns: A list of column names to be treated as time columns. Only
          consulted for the 'records' format, where the first matching column
          becomes the index (falling back to the first column when none match).
        - recordformat: A string specifying the format of the data records ('records', 'table', 'split', 'index', 'tight').
    Returns:
        - df: A pandas DataFrame with a UTC DatetimeIndex named 'time', rounded
          to milliseconds.
    """

    orient = recordformat.lower()
    assert orient in ['records', 'table', 'split', 'index', 'tight']
    assert timecolumns, 'No time columns specified'

    # Label of the column that holds the timestamps. Stays None for the
    # formats that already deliver the timestamps as the index.
    time_column = None

    if orient == 'records':
        # data is a structured ndarray, sequence of tuples or dicts, or DataFrame
        df = pd.DataFrame.from_records(data)
        candidates = [C for C in df.columns if C in timecolumns]
        time_column = candidates[0] if candidates else df.columns[0]

    elif orient == 'table':
        # data is in pandas table format; the first primary key is the time column
        df = pd.DataFrame.from_dict(data['data'])
        time_column = data['schema']['primaryKey'][0]

    elif orient == 'split':
        # DataFrame.from_dict has no 'split' orient, so assemble the frame by hand
        df = pd.DataFrame(data['data'], index=data['index'], columns=data['columns'])

    else:
        # 'index' and 'tight' are understood by pandas directly
        df = pd.DataFrame.from_dict(data, orient=orient)

    # Compare against None explicitly: a positional column label may be 0.
    if time_column is not None:
        df = df.set_index(time_column)

    df.columns = list(df.columns)

    # From here on every format carries its timestamps in the index.
    df.index = pd.DatetimeIndex(
        pd.to_datetime(df.index, utc=True, format='ISO8601')
    ).round('ms')
    df.index.name = 'time'

    return df
df = timeseries_dataframe_from_datadict([
      {
         "time":"2023-05-04T10:04:49.000Z",
         "value":16.72
      },
      {
         "time":"2023-05-04T10:24:51.000Z",
         "value":16.65
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":16.55
      }
   ], timecolumns=['time'])
df
value
time
2023-05-04 10:04:49+00:00 16.72
2023-05-04 10:24:51+00:00 16.65
2023-05-04 10:44:53+00:00 16.55
df.index
DatetimeIndex(['2023-05-04 10:04:49+00:00', '2023-05-04 10:24:51+00:00',
               '2023-05-04 10:44:53+00:00'],
              dtype='datetime64[ns, UTC]', name='time', freq=None)
df = set_time_index_zone( timeseries_dataframe_from_datadict([
      {
         "time":"2023-05-04T10:04:49",
         "value":16.72
      },
      {
         "time":"2023-05-04T10:24:51",
         "value":16.65
      },
      {
         "time":"2023-05-04T10:44:53",
         "value":16.55
      }
   ], timecolumns=['time']), timezone='Europe/Amsterdam')
df
value
time
2023-05-04 12:04:49+02:00 16.72
2023-05-04 12:24:51+02:00 16.65
2023-05-04 12:44:53+02:00 16.55
df.index
DatetimeIndex(['2023-05-04 12:04:49+02:00', '2023-05-04 12:24:51+02:00',
               '2023-05-04 12:44:53+02:00'],
              dtype='datetime64[ns, Europe/Amsterdam]', name='time', freq=None)

Handling NaN values in the output


source

pop_nan_values

 pop_nan_values (data)

*Recursively pop keys with nan values from dict or lists with dicts.

Args: data (Union[list, dict]): The data to be processed.

Returns: Union[list, dict]: The processed data with keys with nan values removed.*

Timezones

rng = pd.date_range(pd.Timestamp("2018-04-10T09:01:01.123+02:00"), periods=3, freq='s').tz_convert('Europe/Amsterdam')
rng
DatetimeIndex(['2018-04-10 09:01:01.123000+02:00',
               '2018-04-10 09:01:02.123000+02:00',
               '2018-04-10 09:01:03.123000+02:00'],
              dtype='datetime64[ns, Europe/Amsterdam]', freq='s')
rng.strftime("%FT%R:%S%z")
Index(['2018-04-10T09:01:01+0200', '2018-04-10T09:01:02+0200',
       '2018-04-10T09:01:03+0200'],
      dtype='object')
pd.DatetimeIndex(rng.strftime("%FT%R:%S%z")).round('ms')
DatetimeIndex(['2018-04-10 09:01:01+02:00', '2018-04-10 09:01:02+02:00',
               '2018-04-10 09:01:03+02:00'],
              dtype='datetime64[ns, UTC+02:00]', freq=None)
rng.tz_convert('UTC').strftime("%FT%R:%SZ")
Index(['2018-04-10T07:01:01Z', '2018-04-10T07:01:02Z', '2018-04-10T07:01:03Z'], dtype='object')
# .map(lambda x: x.isoformat())
rng = pd.date_range(pd.Timestamp("2018-04-10T09:01:01.123+02:00"), periods=30000, freq='s').tz_convert('Europe/Amsterdam')

Which is faster, strftime() or isoformat()?

ft = rng.strftime("%FT%R:%S%z")
116 ms ± 1.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
ft = rng.map(lambda x: x.isoformat(timespec='milliseconds'))
76.5 ms ± 338 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

source

timeseries_dataframe_to_datadict

 timeseries_dataframe_to_datadict
                                   (data:Union[pandas.core.frame.DataFrame
                                   ,pandas.core.series.Series,dict],
                                   recordformat:str='records',
                                   timezone:str='UTC', popNaN:bool=False)

*Convert a timeseries DataFrame or Series into a dictionary representation.

Args: data (Union[pd.DataFrame, pd.Series, dict]): The input data to be converted. It can be a pandas DataFrame, Series, or a dictionary. recordformat (str, optional): The format of the output records. Defaults to ‘records’. timezone (str, optional): The timezone to use for the DataFrame index. Defaults to ‘UTC’. popNaN (bool, optional): Whether to remove NaN values from the output dictionary. Defaults to False.

Returns: Union[dict, list]: The converted dictionary representation of the input data. If popNaN is True, it returns a dictionary with NaN values removed. Otherwise, it returns a dictionary or a list of dictionaries depending on the recordformat parameter.*

df = timeseries_dataframe_from_datadict([
      {
         "time":"2023-05-04T10:04:49.050+01:00",
         "value":16.72
      },
      {
         "time":"2023-05-04T10:24:51.010Z",
         "value":16.65
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":16.55
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":np.nan
      }
   ], timecolumns=['time'])

df
value
time
2023-05-04 09:04:49.050000+00:00 16.72
2023-05-04 10:24:51.010000+00:00 16.65
2023-05-04 10:44:53+00:00 16.55
2023-05-04 10:44:53+00:00 NaN
df.index
DatetimeIndex(['2023-05-04 09:04:49.050000+00:00',
               '2023-05-04 10:24:51.010000+00:00',
                      '2023-05-04 10:44:53+00:00',
                      '2023-05-04 10:44:53+00:00'],
              dtype='datetime64[ns, UTC]', name='time', freq=None)
df = timeseries_dataframe_from_datadict([
      {
         "time":"2023-05-04T10:04:49.050+01:00",
         "value":16.72
      },
      {
         "time":"2023-05-04T10:24:51.010Z",
         "value":16.65
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":16.55
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":np.nan
      }
   ], timecolumns=['time'])

df
value
time
2023-05-04 09:04:49.050000+00:00 16.72
2023-05-04 10:24:51.010000+00:00 16.65
2023-05-04 10:44:53+00:00 16.55
2023-05-04 10:44:53+00:00 NaN
timeseries_dataframe(df, timezone='UTC').index
DatetimeIndex(['2023-05-04 09:04:49.050000+00:00',
               '2023-05-04 10:24:51.010000+00:00',
                      '2023-05-04 10:44:53+00:00',
                      '2023-05-04 10:44:53+00:00'],
              dtype='datetime64[ns, UTC]', name='time', freq=None)
df = timeseries_dataframe_from_datadict([
      {
         "time":"2023-05-04T10:04:49.050+01:00",
         "value":16.72
      },
      {
         "time":"2023-05-04T10:24:51.010Z",
         "value":16.65
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":16.55
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":np.nan
      }
   ], timecolumns=['time'])

df
value
time
2023-05-04 09:04:49.050000+00:00 16.72
2023-05-04 10:24:51.010000+00:00 16.65
2023-05-04 10:44:53+00:00 16.55
2023-05-04 10:44:53+00:00 NaN
timeseries_dataframe_to_datadict(df, recordformat='records', popNaN=True)
Normalized, UTC
[{'time': '2023-05-04T09:04:49Z', 'value': 16.72},
 {'time': '2023-05-04T10:24:51Z', 'value': 16.65},
 {'time': '2023-05-04T10:44:53Z', 'value': 16.55},
 {'time': '2023-05-04T10:44:53Z'}]
df = timeseries_dataframe_from_datadict([
      {
         "time":"2023-05-04T10:04:49.050+01:00",
         "value":16.72
      },
      {
         "time":"2023-05-04T10:24:51.010Z",
         "value":16.65
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":16.55
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":np.nan
      }
   ], timecolumns=['time'])

df
value
time
2023-05-04 09:04:49.050000+00:00 16.72
2023-05-04 10:24:51.010000+00:00 16.65
2023-05-04 10:44:53+00:00 16.55
2023-05-04 10:44:53+00:00 NaN
timeseries_dataframe_to_datadict(df, recordformat='records', popNaN=True, timezone='Europe/Berlin')
Normalized, Europe/Berlin
[{'time': '2023-05-04T11:04:49.050+02:00', 'value': 16.72},
 {'time': '2023-05-04T12:24:51.010+02:00', 'value': 16.65},
 {'time': '2023-05-04T12:44:53.000+02:00', 'value': 16.55},
 {'time': '2023-05-04T12:44:53.000+02:00'}]
df.index
DatetimeIndex(['2023-05-04 09:04:49.050000+00:00',
               '2023-05-04 10:24:51.010000+00:00',
                      '2023-05-04 10:44:53+00:00',
                      '2023-05-04 10:44:53+00:00'],
              dtype='datetime64[ns, UTC]', name='time', freq=None)
df = timeseries_dataframe_from_datadict([
      {
         "time":"2023-05-04T10:04:49.050+01:00",
         "value":16.72
      },
      {
         "time":"2023-05-04T10:24:51.010Z",
         "value":16.65
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":16.55
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":np.nan
      }
   ], timecolumns=['time'])

df
value
time
2023-05-04 09:04:49.050000+00:00 16.72
2023-05-04 10:24:51.010000+00:00 16.65
2023-05-04 10:44:53+00:00 16.55
2023-05-04 10:44:53+00:00 NaN
timeseries_dataframe_to_datadict(df, recordformat='tight', popNaN=True)
Normalized, UTC
{'index': ['2023-05-04T09:04:49Z',
  '2023-05-04T10:24:51Z',
  '2023-05-04T10:44:53Z',
  '2023-05-04T10:44:53Z'],
 'columns': ['value'],
 'data': [[16.72], [16.65], [16.55]],
 'index_names': ['time']}
test_data = {'index': ['2023-05-04T10:04:49+00:00',
  '2023-05-04T10:24:51+00:00',
  '2023-05-04T10:44:53+00:00',
  '2023-05-04T10:44:53+00:00'],
 'columns': ['value'],
 'data': [[16.72], [16.65], [16.55], [np.nan]],
 'index_names': ['time'],
 'column_names': [None]}
pop_nan_values(test_data)
{'index': ['2023-05-04T10:04:49+00:00',
  '2023-05-04T10:24:51+00:00',
  '2023-05-04T10:44:53+00:00',
  '2023-05-04T10:44:53+00:00'],
 'columns': ['value'],
 'data': [[16.72], [16.65], [16.55]],
 'index_names': ['time']}
pop_nan_values(test_data)
73.3 µs ± 314 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

source

timeseries_dataframe_resample

 timeseries_dataframe_resample (df:pandas.core.frame.DataFrame,
                                period:str, method:str)

*Resamples a time-series DataFrame on the specified period and method.

Parameters: df (pd.DataFrame): The input time-series DataFrame. period (str): The resampling period. method (str): The resampling method. Can be a string of multiple methods separated by ‘;’. method_args (dict, optional): Additional arguments for the resampling method.

Returns: pd.DataFrame: The resampled DataFrame.*

df = timeseries_dataframe_from_datadict([
      {
         "time":"2023-05-04T10:04:49.000Z",
         "value":16.72
      },
      {
         "time":"2023-05-04T10:24:51.000Z",
         "value":16.65
      },
      {
         "time":"2023-05-04T10:44:53.000Z",
         "value":16.55
      },
      {
         "time":"2023-05-04T11:04:49.000Z",
         "value":16.47
      },
      {
         "time":"2023-05-04T11:24:51.000Z",
         "value":16.44
      },
      {
         "time":"2023-05-04T11:44:53.000Z",
         "value":16.38
      },
   ], timecolumns=['time'])
timeseries_dataframe_resample(df, "80min", 'mean;count')
value value_mean value_count
time
2023-05-04 09:20:00+00:00 NaN 16.685 2.0
2023-05-04 10:04:49+00:00 16.72 NaN NaN
2023-05-04 10:24:51+00:00 16.65 NaN NaN
2023-05-04 10:40:00+00:00 NaN 16.460 4.0
2023-05-04 10:44:53+00:00 16.55 NaN NaN
2023-05-04 11:04:49+00:00 16.47 NaN NaN
2023-05-04 11:24:51+00:00 16.44 NaN NaN
2023-05-04 11:44:53+00:00 16.38 NaN NaN

AICoreModuleBase


source

AICoreModuleBase

 AICoreModuleBase ()

Initialize self. See help(type(self)) for accurate signature.

Exported source
class AICoreModuleBase:
    """Empty base class for AICore modules; its methods are attached after the definition via ``@patch``."""

source

AICoreModuleBase.__init__

 AICoreModuleBase.__init__ (save_dir:str, assets_dir:str, *args, **kwargs)

Initialize self. See help(type(self)) for accurate signature.

Type Details
save_dir str path where the module can keep files
assets_dir str path to support files (scripts, metadata, etc)
args VAR_POSITIONAL
kwargs VAR_KEYWORD
Exported source
@patch
def __init__(
    self:AICoreModuleBase, 
    save_dir:str, # path where the module can keep files 
    assets_dir:str, # path to support files (scripts, metadata, etc)
    *args, **kwargs
    ):
    """Record when, with which package version and with which arguments the module was created.

    Extra positional arguments are stored unchanged in ``init_args``. Extra
    keyword arguments are stored in ``init_kwargs`` together with
    ``assets_dir`` and ``save_dir``.
    """

    # Timezone-aware creation time (UTC).
    self.init_time = datetime.datetime.now(datetime.UTC)
    self.aicorebridge_version = __version__

    self.init_args = args
    # Caller-supplied keywords come first, followed by the two directory arguments.
    self.init_kwargs = {
        **kwargs,
        'assets_dir': assets_dir,
        'save_dir': save_dir,
    }
# Smoke test for the constructor: extra positional arguments must end up in
# init_args, extra keyword arguments and the directories in init_kwargs.
save_dir = os.path.join(os.getcwd(), 'cache')
test_module = AICoreModuleBase(save_dir, None, 1, 2, num_1=3, num_2=4)

assert test_module.init_args == (1, 2), "init_args should be (1, 2)"
assert test_module.init_kwargs['num_1'] == 3, "init_kwargs['num_1'] should be 3"
assert test_module.init_kwargs['num_2'] == 4, "init_kwargs['num_2'] should be 4"
assert test_module.init_kwargs['save_dir'] == save_dir, f"init_kwargs['save_dir'] should be {save_dir}"
test_module.__dict__
{'init_time': datetime.datetime(2025, 1, 27, 9, 12, 35, 566628, tzinfo=datetime.timezone.utc),
 'aicorebridge_version': '0.4.0',
 'init_args': (1, 2),
 'init_kwargs': {'num_1': 3,
  'num_2': 4,
  'assets_dir': None,
  'save_dir': '/home/fenke/repos/corebridge/nbs/cache'}}