# ==================================================================================
# Copyright (c) 2020 HCL Technologies Limited.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==================================================================================
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import Normalizer
class PREPROCESS(object):
r""" This PREPROCESS class takes raw data and apply prepocessing on to that.
Parameters
----------
data: pandas dataframe
input dataset to process in pandas dataframe
Attributes
----------
data: DataFrame
DataFrame that has processed data
temp: list
list of attributes to drop
"""
def __init__(self, data):
"""
Columns that are not useful for the prediction will be dropped(UEID, Category, & Timestamp)
"""
self.data = data
self.convert_gb_to_mb()
def variation(self):
""" drop the constant parameters """
if len(self.data) > 1:
self.data = self.data.loc[:, self.data.apply(pd.Series.nunique) != 1]
    def convert_gb_to_mb(self):
        """ convert DRB.UEThpDl from GB to MB (1 GB = 1024 MB) """
        # assign the column directly; chained indexing such as
        # df.iloc[:][col] = ... writes to a temporary copy and may be lost
        self.data['DRB.UEThpDl'] = self.data['DRB.UEThpDl'].apply(lambda x: x * 1024)
def numerical_data(self):
""" Filters only numeric data types """
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
self.data = self.data.select_dtypes(include=numerics)
def drop_na(self):
""" drop observations having nan values """
self.data = self.data.dropna(axis=0)
def correlation(self):
""" check and drop high correlation parameters """
corr = self.data.corr().abs()
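        # keep only the strict lower triangle (k=-1 also zeroes the diagonal),
        # so each pair is inspected once and no column is compared with itself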
corr = pd.DataFrame(np.tril(corr, k=-1), columns=self.data.columns)
drop = [column for column in corr.columns if any(corr[column] > 0.98)]
self.data = self.data.drop(drop, axis=1)
    # TODO: check the skewness of all parameters and apply a log transform if at
    # least half of them are sufficiently skewed; otherwise use standardization
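    # A minimal sketch of the TODO above, not called by process(); the
    # |skewness| > 1 cut-off is an assumption, and np.log1p presumes numeric,
    # non-negative parameter values.
    def skew_transform(self):
        """ log-transform the data when at least half of the parameters are skewed """
        skew = self.data.skew().abs()
        if (skew > 1).sum() >= len(skew) / 2:
            self.data = np.log1p(self.data)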
def fit_transform(self):
""" use normalizer transformation to bring all parameters in same scale """
scale = Normalizer().fit(self.data)
joblib.dump(scale, 'src/scale')
    def transform(self):
        """ apply the persisted Normalizer to the data """
scale = joblib.load('src/scale')
self.data = pd.DataFrame(scale.transform(self.data), columns=self.data.columns)
    def save_cols(self):
        """ persist the names of the surviving columns for reuse at prediction time """
joblib.dump(self.data.columns, 'src/num_params')
    def process(self):
        """
        Runs the preprocessing steps in order: dropping unusable columns,
        keeping numeric data, removing NaN rows, dropping constant and highly
        correlated parameters, and normalization.
        """
        # collect columns that are labels or metadata rather than model inputs
        temp = []
        for col in self.data.columns:
            if 'nb' in col or 'Geo' in col or 'anomal' in col or 'target' in col:
                temp.append(col)
        self.data = self.data.drop(temp, axis=1)
self.numerical_data()
self.drop_na()
self.variation()
self.correlation()
self.fit_transform()
self.transform()
self.save_cols()
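# Minimal usage sketch (not part of the original module); apart from
# 'DRB.UEThpDl', the column names below are illustrative assumptions.
if __name__ == '__main__':
    import os
    os.makedirs('src', exist_ok=True)  # fit_transform()/save_cols() persist artifacts under src/
    df = pd.DataFrame({
        'DRB.UEThpDl': [1.2, 0.8, 2.5, 1.9],        # throughput in GB, converted to MB on init
        'RF.serving.RSRP': [-100, -95, -110, -90],  # assumed numeric KPI
        'UEID': ['ue1', 'ue2', 'ue3', 'ue4'],       # non-numeric, removed by numerical_data()
    })
    ps = PREPROCESS(df)
    ps.process()
    print(ps.data)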