import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from math import sqrt
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
# Module-level alias for pandas' datetime parser.
# Presumably handed to pd.read_csv(date_parser=...) by the data factory —
# TODO confirm against the caller; nothing in this file uses it directly.
date_parser = pd.to_datetime
class SeriesSupp:
    """Organize and manipulate the time-series data.

    Parameters:
        * cwd: String
            Working directory the main script is executed from
        * factory: Factory
            Instance of the data factory
        * dataset_name: String
            Identifies which kind of data to fetch

    Variables:
        * dataset: {Dict}
            The original dataset, kept unmodified
        * tmp_dataset: {Dict}
            The current dataset, carrying all modifications
        * years: [ARRAY<STRING>]
            Year-split setup
        * months: [ARRAY<STRING>]
            Month-split setup
        * days: [ARRAY<STRING>]
            Day-split setup | [BOOL] week-split marker
        * factory: DataFactory
            Instance of the Factory
        * dataset_name: String
            Identifies the desired source, e.g. for import_dataset()
    """
def __init__(self, cwd, factory, dataset_name):
self.cwd = cwd
self.dataset = {}
self.tmp_dataset = {}
self.years = []
self.months = []
self.days = False
self.factory = factory
self.norm = False
self.rounded = False
self.smoothed = False
self.dataset_name = dataset_name
self.reset_years()
def __repr__(self):
""" Representation de l'instance """
return str("Dataset: " + str(self.dataset_name) + ". De taille source: " + str(len(self.dataset)) + ". et de taille current: " + str(len(self.tmp_dataset)))
[docs] def reset_years(self):
"""Par defaut decoupe les TS dans la granularite maximale"""
self.years = [2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015]
[docs] def reset_months(self):
"""Par defaut decoupe les TS dans la granularite maximale"""
self.months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
[docs] def reset_days(self):
"""Par defaut decoupe les TS dans la granularite maximale"""
self.days = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
[docs] def reset_dataset(self):
""" Retourne aux donnees importes avant modifications """
reset = self.dataset.copy()
self.tmp_dataset = reset
self.norm = False
self.days = False
[docs] def reset_setup(self):
""" Full reset des variables de granularites """
self.reset_years()
self.reset_months()
self.reset_days()
[docs] def info(self):
""" Permet d'avoir une idees des donnees du dataset en prenant une TS au hasard"""
k, v = next(iter(self.dataset.items()))
print("Taille du dataset dictionnaire: " + str(len(self.dataset)))
print("Capteur: " + k)
print(v.info())
print(v.head())
print(v.tail())
[docs] def get_data(self):
""" Getter du dataset modifie """
return self.tmp_dataset.copy()
[docs] def import_dataset(self):
""" Appel a la factory pour recuperer les donnees """
self.dataset = self.factory.get_data(self.dataset_name)
self.reset_dataset()
[docs] def smooth(self, data, wind, col):
"""
Smooth via rolling window
Parameters:
* data: DataFrame
La DF a smooth, attention a bien choisir la colonne voulu
* wind: int
Taille de fentre
* col: String
La colonne de la DF a smooth
Returns:
data: DataFrame
La DF remanie
"""
data[col] = data[col].rolling(window = wind, center = True).mean()
data = data.drop(list(range(len(data) - wind,len(data))))
data = data.drop(list(range(wind)))
data.reset_index()
return data
[docs] def dict_smooth(self, wind = 24, col = "Valeur"):
"""
Dictionnaire full smooth
Parameters:
* wind: int
Taille de la fenetre 24 pour smooth journalier
* col: str (DEFAUlT = "Valeur")
Colonne cible de l'action
Returns:
NA
"""
res = {}
tampon = self.tmp_dataset.copy()
for k, v in tampon.items():
tampon_v = v.copy()
res[k] = self.smooth(data = tampon_v, wind = wind, col = col)
self.tmp_dataset = res
self.smoothed = True
[docs] def normalize(self, data):
"""
Normalisation des TS, moyenne: 0 et ecart type: 1
Data: dataframe
"""
# prepare data for standardization
values = data.loc[:,("Valeur")]
values_r = values.values.reshape((len(values), 1))
# train the standardization
scaler = StandardScaler()
scaler = scaler.fit(values_r)
tr_v = scaler.transform(values_r)
data["Valeur"] = tr_v
return data
[docs] def standardize(self, data):
"""
Standardize des TS, moyenne: 0 et ecart type: 1
Data: dataframe
"""
# prepare data for standardization
values = data["Valeur"]
values = values.values.reshape((len(values), 1))
# train the standardization
t = TimeSeriesScalerMeanVariance().fit_transform(values)
print(t)
data["valeur"] = TimeSeriesScalerMeanVariance().fit_transform(values)
return data
[docs] def dict_round(self):
""" Normalise un dictionnaire de TS """
for k, v in self.tmp_dataset.items():
#print(v.shape)
v = self.rounding(v)
self.rounded = True
[docs] def dict_norm(self):
""" Normalise un dictionnaire de TS """
for k, v in self.tmp_dataset.items():
#print(v.shape)
v = self.normalize(v)
self.norm = True
[docs] def dict_stand(self):
""" Normalise un dictionnaire de TS """
for k, v in self.tmp_dataset.items():
#print(v.shape)
v = self.standardize(v)
self.norm = True
[docs] def split_data_years(self):
""" Decoupage des TS selon la variable d'annees """
res = {}
for y in self.years:
for k, v in self.tmp_dataset.items():
v = v.set_index("Date")
try:
tmp = v[v.index.year == y]
v = v.reset_index()
tmp = tmp.reset_index()
if tmp.shape[0] != 0:
res[str(k) + "_" + str(y) ] = tmp
except KeyError:
print(k + ": pas de données en " + str(y))
if len(res) != 0:
self.tmp_dataset = res
[docs] def split_data_months(self):
""" Decoupage des TS selon la variable de mois """
res = {}
for m in self.months:
for k, v in self.tmp_dataset.items():
v = v.set_index("Date")
try:
tmp = v[v.index.month == m]
v = v.reset_index()
tmp = tmp.reset_index()
if tmp.shape[0] != 0:
if len(str(m)) == 1:
ret_m = "0"+str(m)
else:
ret_m = m
res[str(k) + "_" + str(ret_m) ] = tmp
except KeyError:
print(k + ": pas de données en " + str(m))
if len(res) != 0:
self.tmp_dataset = res
[docs] def split_data_weeks(self):
""" Decoupage des TS selon les semaines"""
res = {}
for k, v in self.tmp_dataset.items():
tmp = v.groupby(pd.Grouper(key='Date', freq='W'))
for i in tmp:
if i[1].shape[0] != 0:
if len(str(i[0].week)) == 1:
ret_d = "0"+str(i[0].week)
else:
ret_d = i[0].week
res[str(k) + "_" + str(ret_d) ] = i[1]
if len(res) != 0:
self.tmp_dataset = res
self.days = True
[docs] def split_all(self):
self.split_data_years()
self.split_data_months()
if self.days:
self.split_data_weeks()
[docs] def get_data_from_captor(self, cpt):
"""
Retourne toutes les series temporelles liees a un nom de capteur
Parameters:
* cpt: String
Nom du capteur desire
Returns:
res: {Dict}
Sous dataset du capteur associe, les clefs sont les differente declinaisons en series temporelles du capteur, varie en longueur selon la granularite
"""
res = {}
for k, v in self.tmp_dataset.items():
if cpt in k:
res[k] = v
return res
################################################# Precise splits where each captor is not compared to itself #################################################
    def split_year_multi_month(self):
        """Keep, for each TS, only the ``self.months`` slices of ``self.year``.

        Uses partial-string datetime indexing (e.g. ``v["2013-06"]``) on the
        "Date" index and concatenates the selected months per captor.

        NOTE(review): relies on ``self.year`` and string-typed ``self.months``
        entries (they are concatenated with ``+``), neither of which is set
        up by ``__init__`` — presumably assigned by the caller beforehand;
        verify. With the integer months from ``reset_months`` the ``+``
        concatenation would raise TypeError.
        """
        res = {}
        for k, v in self.tmp_dataset.items():
            v = v.set_index("Date")
            # Empty sentinel frame: replaced by the first successful slice.
            tmp = pd.DataFrame({'A' : []})
            try:
                for m in self.months:
                    if tmp.empty:
                        tmp = v[self.year +"-"+ m]
                    else:
                        tmp = pd.concat([tmp, v[self.year +"-"+ m]])
                tmp = tmp.reset_index()
                res[k] = tmp
            except KeyError:
                # One missing month drops the whole captor (no partial result).
                print(k + ": pas de données en " + self.year +"-"+ str(m))
        self.tmp_dataset = res
    def split_year_month_multi_day(self, dataset, month):
        """Keep, per TS of ``dataset``, the ``self.days`` slices of ``self.year``-``month``.

        Parameters:
            * dataset: {Dict}
                Captor name -> DataFrame with a "Date" column
            * month: String
                Month component of the partial-date key, e.g. "06"
        Returns:
            res: {Dict}
                Same keys, frames restricted to the requested days

        NOTE(review): assumes ``self.year`` and the ``self.days`` entries are
        strings (concatenated with ``+``) — the ``reset_days`` integers would
        raise TypeError; confirm callers set them up as strings.
        """
        res = {}
        for k, v in dataset.items():
            v = v.set_index("Date")
            # Empty sentinel frame: replaced by the first successful slice.
            tmp = pd.DataFrame({'A' : []})
            try:
                for d in self.days:
                    if tmp.empty:
                        tmp = v[self.year +"-"+ month + "-" + d]
                    else:
                        tmp = pd.concat([tmp, v[self.year +"-"+ month + "-" + d]])
                tmp = tmp.reset_index()
                res[k] = tmp
            except KeyError:
                # One missing day drops the whole captor from the result.
                print(k + ": pas de données en " + self.year +"-"+ month + "-" + d)
        return res
    def split_year_multi_month_multi_day(self):
        """Keep, per TS, the ``self.months`` × ``self.days`` slices of ``self.year``.

        Unlike the sibling split methods, a missing month/day combination is
        skipped individually (per-slot try/except) instead of dropping the
        whole captor.

        NOTE(review): ``self.year`` is wrapped in str() for the lookup but
        concatenated raw in the error message — inconsistent with the other
        split_* methods; confirm the intended type of ``self.year`` and of
        the ``self.months``/``self.days`` entries (string concatenation).
        """
        res = {}
        for k, v in self.tmp_dataset.items():
            v = v.set_index("Date")
            # Empty sentinel frame: replaced by the first successful slice.
            tmp = pd.DataFrame({'A' : []})
            for m in self.months:
                for d in self.days:
                    if tmp.empty:
                        try:
                            tmp = v[str(self.year) +"-"+ m + "-" + d]
                        except KeyError:
                            print(k + ": pas de données en " + self.year +"-"+ m +"-"+ d)
                    else:
                        try:
                            tmp = pd.concat([tmp, v[str(self.year) +"-"+ m +"-"+ d]])
                        except KeyError:
                            print(k + ": pas de données en " + self.year +"-"+ m +"-"+ d)
            tmp = tmp.reset_index()
            res[k] = tmp
        self.tmp_dataset = res
[docs] def split_each_steps(self):
res = {}
if not self.months:
self.reset_months()
if not self.days:
self.reset_days()
for k, v in self.tmp_dataset.items():
v = v.set_index("Date")
tmp = pd.DataFrame({'A' : []})
for y in self.years:
for m in self.months:
for d in self.days:
pass