# Untitled

# Never
import featuretools as ft
import pandas as pd
from IPython.display import display

# Let pandas show up to 200 columns and 200 rows before truncating output.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)

# Build a dict mapping table name -> pandas DataFrame, one entry per pickle.

import os

# Directory holding the pickled tables; each table lives in "<name>.pkl".
DATA_DIR = r"C:\Users\admin\PycharmProjects\ActiveAuth\008"

# 'BEHAVIOSEC_header' was previously listed twice, which re-read the same
# pickle and overwrote an identical entry; each table is now listed once.
list_names = [
    'BEHAVIOSEC_header',
    'BEHAVIOSEC_ACC',
    'BEHAVIOSEC_GYRO',
    'BEHAVIOSEC_MOTION',
    'BEHAVIOSEC_USAGE',
    'HCL_ambientBrightness',
    'HCL_appData',
    'HCL_gravity',
    'HCL_magneticField',
]

# NOTE: unpickling can execute arbitrary code -- only load trusted files.
dict_name_pd = {
    name: pd.read_pickle(os.path.join(DATA_DIR, str(name) + '.pkl'))
    for name in list_names
}

# Quick visual check of every loaded table: name, first rows, column dtypes.
for key, pd_data in dict_name_pd.items():
    print(key)
    display(pd_data.head())
    print(pd_data.dtypes)

# Preprocessing of the accelerometer and gyroscope tables: give the axis
# columns a per-sensor suffix so they stay distinct once the tables are
# stacked, and drop the 'eventType' and 'tag' columns from both.
axis_renames = {
    'BEHAVIOSEC_ACC': {'xValue': 'xValue_acc', 'yValue': 'yValue_acc', 'zValue': 'zValue_acc'},
    'BEHAVIOSEC_GYRO': {'xValue': 'xValue_gyro', 'yValue': 'yValue_gyro', 'zValue': 'zValue_gyro'},
}

for table_name, renames in axis_renames.items():
    sensor_table = dict_name_pd[table_name]
    sensor_table.rename(columns=renames, inplace=True)
    sensor_table.drop(columns=['eventType', 'tag'], inplace=True)

# BEHAVIOSEC_ACC + BEHAVIOSEC_GYRO, stacked row-wise without sorting columns.
sensor_frames = [dict_name_pd['BEHAVIOSEC_ACC'], dict_name_pd['BEHAVIOSEC_GYRO']]
data1 = pd.concat(sensor_frames, sort=False)
data1.head()

# Convert the MOTION 'majorAxis' / 'minorAxis' columns to float, treating the
# '-' placeholder as a missing value.
# `pd.np` was removed in pandas 2.0, so a plain float NaN is used instead.
motion_table = dict_name_pd['BEHAVIOSEC_MOTION']
for axis_column in ('majorAxis', 'minorAxis'):
    motion_table[axis_column] = (
        motion_table[axis_column].replace('-', float('nan')).astype(float)
    )

# Preprocessing of the usage and motion tables: only drop columns here,
# nothing is renamed.
usage_frame = dict_name_pd['BEHAVIOSEC_USAGE']
motion_frame = dict_name_pd['BEHAVIOSEC_MOTION']

usage_frame.drop(columns=['hash', 'tag'], inplace=True)
motion_frame.drop(columns=['tag', 'eventType'], inplace=True)

# 'uiElementType' is removed only when the motion table actually has it.
if 'uiElementType' in motion_frame.columns:
    motion_frame.drop(columns=['uiElementType'], inplace=True)

# MOTION + BEHAVIOSEC_USAGE, stacked row-wise without sorting columns.
data2 = pd.concat([motion_frame, usage_frame], sort=False)

# Per-session offset that is added to 'eventTime' to obtain
# 'eventTimeHardware' for the motion/usage rows.
#
# The header's 'time' field is a string of two space-separated integers; the
# offset is (first - second // 1000) * 1_000_000_000.
# NOTE(review): this looks like "epoch seconds" and "event time in ms"
# converted to nanoseconds -- confirm the units against the data source.
header = dict_name_pd['BEHAVIOSEC_header']  # loop-invariant, looked up once

difference = {}
for session_id in data2['SESSION'].unique():
    # The first header row of the session provides the reference timestamps.
    header_time = header.loc[header['SESSION'] == session_id, 'time'].iloc[0]
    wall_clock, header_event_time = map(int, header_time.split(' '))
    difference[session_id] = (wall_clock - header_event_time // 1000) * 1_000_000_000

# Vectorised replacement for the former row-wise DataFrame.apply: every
# session present in data2 has an entry in `difference`, so the lookup yields
# the same offsets without a Python-level call per row.
data2['eventTimeHardware'] = data2['SESSION'].map(difference) + data2['eventTime']

# Column renames for the HCL tables: the hardware timestamp column gets the
# shared name 'eventTimeHardware' and value columns get a per-table suffix.
hcl_renames = {
    'HCL_magneticField': {'epoc_time_ns_hardware': 'eventTimeHardware', 'x': 'x_mf', 'y': 'y_mf', 'z': 'z_mf'},
    'HCL_ambientBrightness': {'epoc_time_ns_hardware': 'eventTimeHardware', 'value': 'value_aB'},
    'HCL_gravity': {'epoc_time_ns_hardware': 'eventTimeHardware', 'x': 'x_gr', 'y': 'y_gr', 'z': 'z_gr'},
}

for hcl_table, column_map in hcl_renames.items():
    dict_name_pd[hcl_table].rename(columns=column_map, inplace=True)

# magneticField + ambientBrightness + gravity, stacked in that order.
data3 = pd.concat([dict_name_pd[hcl_table] for hcl_table in hcl_renames], sort=False)

# Preview the three intermediate frames, then stack them into one frame.
partial_frames = [data1, data2, data3]
for partial in partial_frames:
    display(partial.head())

data = pd.concat(partial_frames, sort=False)

data.head()

# Create a new column holding eventTimeHardware floored to a 15-second
# boundary; the later groupby uses USER, SESSION and this new column.

# Timestamps are recorded in nanoseconds, so one window is 15 * 10**9 ns.
WINDOW_NS = 15 * (10**9)

data.sort_values(by=['eventTimeHardware'], inplace=True)

# Carry the last known value of every column forward in time.
# `fillna(method='ffill')` is deprecated in pandas >= 2.1; `ffill()` is the
# equivalent supported call.
# NOTE(review): the fill is not grouped, so values can carry over across
# USER/SESSION boundaries -- confirm this is intended.
data.ffill(inplace=True)

# Start of the 15-second window each row falls into (still in nanoseconds).
data['timeDividedBy15'] = data['eventTimeHardware'] // WINDOW_NS * WINDOW_NS

# The raw timestamp columns are no longer needed once the window is known.
data.drop(columns=['eventTimeHardware', 'eventTime'], inplace=True)
data.head()

# Columns that are summarised per (USER, SESSION, 15-second window).
features = [
    'xValue_acc', 'yValue_acc', 'zValue_acc',
    'xValue_gyro', 'yValue_gyro', 'zValue_gyro',
    'eventPressure', 'majorAxis', 'minorAxis',
    'positionX', 'positionY',
    'cpuUsage', 'memUsage',
    'x_mf', 'y_mf', 'z_mf',
    'value_aB',
    'x_gr', 'y_gr', 'z_gr',
]

# Statistics computed for every feature, in output-column order.
window_statistics = (
    pd.Series.mean,
    pd.Series.sum,
    pd.Series.min,
    pd.Series.max,
    pd.Series.std,
    pd.Series.skew,
)

# Each feature gets its own list holding the same statistics.
agg = {feature: list(window_statistics) for feature in features}

windows = data.groupby(['USER', 'SESSION', 'timeDividedBy15'])
data = windows.agg(agg)

# Flatten the (feature, statistic) column MultiIndex into 'feature_statistic'.
data.columns = ['_'.join(column_pair) for column_pair in data.columns]

# Discard windows in which any statistic is missing.
data.dropna(how='any', inplace=True)

# Turn the three group keys back into ordinary columns.
data = data.reset_index(level=['USER', 'SESSION', 'timeDividedBy15'])
data.head()

# Attach the foreground application name to every 15-second feature window
# and write the result to disk.

# Work on a copy so the loaded HCL_appData table is left untouched.
data_app = dict_name_pd['HCL_appData'].copy()
data_app.head()

# 'appForegroundStartEpoch' is scaled by 1_000_000 and floored to the same
# 15 * 10**9 window size used for the sensor features.
# NOTE(review): the scaling suggests the epoch is in milliseconds -- confirm.
WINDOW_NS = 15 * (10**9)
data_app['timeDividedBy15'] = data_app['appForegroundStartEpoch'] * 1_000_000 // WINDOW_NS * WINDOW_NS

# Keep only the merge keys and the app name, preserving column order.
kept_columns = {'SESSION', 'USER', 'timeDividedBy15', 'appName'}
data_app = data_app[[column for column in data_app.columns if column in kept_columns]]

# Rows whose window start is 0 are discarded.
data_app = data_app[data_app['timeDividedBy15'] != 0]

# Outer merge keeps windows that exist on only one side.
final_data = data.merge(data_app, how='outer', on=['USER', 'SESSION', 'timeDividedBy15'])

final_data.sort_values(by=['timeDividedBy15'], inplace=True)

# Assign the forward-filled column back explicitly: the former
# `final_data['appName'].fillna(method='ffill', inplace=True)` is a chained
# in-place update that does not modify `final_data` under pandas
# Copy-on-Write, and `fillna(method=...)` is deprecated.
final_data['appName'] = final_data['appName'].ffill()

final_data.to_pickle('final_008.pkl')

# Raw Text