Skip to content
Snippets Groups Projects
Commit 249b19a5 authored by Sergei Panarin
Browse files

added regression calculation, data filtering function, modified genre...

added regression calculation, data filtering function, modified genre aggregation to dictionary form
parent a6067192
No related branches found
No related tags found
No related merge requests found
......@@ -32,7 +32,7 @@ def read_data():
# @param: pandas dataframe with the full data
# @return: pandas dataframe with developer, publisher columns encoded with integer IDs
def label_encoding(data):
to_be_encoded = ["publisher", "developer", "genres"]
to_be_encoded = ["publisher", "developer"]
# remove everything but the first element in those columns and categorize everything
for val in to_be_encoded:
......@@ -52,15 +52,43 @@ def label_encoding(data):
# @return: dict mapping each genre name to a pandas dataframe with release_date (time interval) and owners (sum) columns
def genre_data_aggregation(data, interval):
    # work on a copy of the three needed columns: the caller's dataframe is left untouched,
    # so the function can safely be called more than once on the same input
    frame = data[['release_date', 'genres', 'owners']].copy()
    # preprocess the release date column into the pandas datetime format
    frame['release_date'] = pd.to_datetime(frame['release_date'], dayfirst=True, format="mixed")
    # split the comma separated genre string into a list of names and remove whitespaces;
    # str() is applied first, so non-string values are turned into text instead of raising
    frame['genres'] = frame['genres'].map(
        lambda raw: [name.strip() for name in str(raw).split(",")]
    )
    # one row per (row, genre) pair
    exploded = frame.explode('genres')
    # bins of `interval` months; an offset object is used instead of the "<n>M" alias
    # string because newer pandas releases deprecate that alias
    bin_size = pd.offsets.MonthEnd(int(interval))
    dict_data = {}
    for name, group in exploded.groupby('genres'):
        # rows with a missing date or owners value cannot be aggregated
        usable = group.dropna(how='any')
        if usable.empty:
            # nothing left for this genre: skip it instead of failing
            continue
        # group by the time interval and get sum of the owners
        summed = usable.groupby(pd.Grouper(key='release_date', freq=bin_size)).agg({'owners': 'sum'})
        dict_data[name] = summed.reset_index()
    return dict_data
# Resets Index of the merged dataframe
def clean_index(data):
......@@ -84,23 +112,83 @@ def replace_owner_str_with_average_number(data):
data["owners"] = data["owners"].apply(lambda name: float(sum(name)/len(name)))
return data
# Replaces the release dates with their ordinal day numbers so they can be used as numbers
# @param: pandas dataframe with a release_date column holding datetime values
# @return: the same dataframe object, with release_date overwritten in place
def encode_time(df):
    to_ordinal = dt.datetime.toordinal
    release_dates = df['release_date']
    df['release_date'] = release_dates.map(to_ordinal)
    return df
# Fits a linear regression of the owners column on the release_date column
# @param: pandas dataframe with release_date and owners columns (release_date must already
#         be numeric, e.g. after encode_time)
# @return: fitted LinearRegression model
def lin_reg(df):
    # plain numpy arrays (no column names) are passed to fit(), so that later predict()
    # calls made with numpy arrays match what the model was fitted on
    X = df[['release_date']].to_numpy()
    y = df['owners'].to_numpy()
    model = LinearRegression()  # create linear regression object
    model.fit(X, y)
    return model
# Shows a scatter plot of owners against release_date for one genre
# @param: dict of per-genre dataframes, key of the genre to plot
def get_genre_plot(dict_data, genre):
    x_values = dict_data[genre]["release_date"]
    y_values = dict_data[genre]["owners"]
    plt.scatter(x_values, y_values)
    plt.show()
# Builds a list of dates, encoded as ordinals, covering the period from now until `days` days from now
# @param: days - length of the period in days; interval - step between two dates in months (default 2)
# @return: list of ordinal day numbers, one per step; empty when no step falls inside the period
def get_data_interval(days, interval=2):
    base = dt.datetime.now()
    final_date = base + dt.timedelta(days=days)
    # an offset object is used instead of the "2M" alias string because newer pandas
    # releases deprecate that alias
    step = pd.offsets.MonthEnd(interval)
    parts = pd.date_range(pd.Timestamp(base), pd.Timestamp(final_date), freq=step)
    return [t.toordinal() for t in parts]
if __name__ == "__main__":
    # standard library
    import datetime as dt
    import re
    from math import isnan

    # third party
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split

    # load and prepare the data; the return value of label_encoding is not used here
    full_data_df = read_data()
    full_data_df = clean_index(full_data_df)
    label_encoding(full_data_df)
    data = replace_owner_str_with_average_number(full_data_df)

    # per-genre owner sums in 2 month steps, and a plot for one genre
    genre_data = genre_data_aggregation(full_data_df, 2)
    genre = "Action"
    get_genre_plot(genre_data, genre)

    # 730 days = 2 years; one column, one row per date
    dates = np.array(get_data_interval(730)).reshape(-1, 1)

    # GET ALL MODELS FOR ALL GENRES
    models = {}
    predictions = {}
    process_data = genre_data
    for x, genre_frame in process_data.items():
        genre_data[x] = encode_time(genre_frame)
        models[x] = lin_reg(genre_data[x])

    for genre, model in models.items():
        predictions[genre] = model.predict(dates)

    # GET POINT OF REFERENCE
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment