# Project_data_processor_ML.py
  • # Introduction to Data Science, Luento-opetus, 2023
    # Sergei Panarin
    
    
    
    
    
    # Data preprocessing
    
    # IMPORTANT THINGS:
    # Change the number of input files in the read_data function OR later replace with full data file
    # FUNCTION replace owner with symbol is copied from the main branch, DELETE LATER and IMPORT
    
    
    import pandas as pd
    import numpy as np
    import datetime as dt
    import re
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    import matplotlib.pyplot as plt
    from math import isnan
    
    
    
    # Reading the data from source:
    # Pandas cannot read from GitLab URLs, so work is done with locally stored datasets.
    # Go through partial files containing parts of the whole dataset
    # @param  
    # @return: full pandas dataframe, containing all the relevant columns from the csv files
    def read_data():
        # full_data_df = pd.read_csv("full_data.csv")
    
        partial_dfs = []
    
        # range can be changed, reads segments of the full dataset
    
    Max Väistö's avatar
    Max Väistö committed
        for index in range(66):
            datafile="api_exploration/file_segments/game_data_"+str(index+1)+".csv"
    
            partial_dfs.append(pd.read_csv(datafile))
        
        # Combine partial dataframes into the full version
        full_data_df = pd.concat(partial_dfs)
    
        return full_data_df
    
    # transform String columns into integer IDs.
    # @param: pandas dataframe with the full data
    # @return: pandas dataframe with developer, publisher, genres columns encoded with integer IDs
    def label_encoding(data):
    
        to_be_encoded = ["publisher", "developer"]
    
    
        # remove everything but the first element in those columns and categorize everything
        for val in to_be_encoded:
            data[val] = data[val].apply(lambda x: str(x).split(",")[0])
            data[val] = data[val].astype('category')
    
    
        # select the category columns and apply label encoding
        cat_columns = data.select_dtypes(['category']).columns
        data[cat_columns] = data[cat_columns].apply(lambda x: x.cat.codes)
    
    # Transforms full data dataframe into a dataframe with the following columns:
    # - Genre
    # - Amount of purchases
    # - Interval of time (default chosen as 2 months for now)
    # @param: pandas dataframe with the full data, interval integer meaning the number of months
    # @return: pandas dataframe with genre, # of purchases, time interval columns
    
    def genre_data_aggregation(data, interval):
    
    
        data["genres"] = data["genres"].apply(lambda x: str(x).split(","))
    
    
        # preprocess the release date column into the pandas datetime format
    
        data['release_date'] = pd.to_datetime(data['release_date'], dayfirst=True, format="mixed", errors='coerce')
    
        
        # remove whitespaces
        data['genres'] = data['genres'].map(lambda x: list(map(str.strip, x)))
        data = data[['release_date', 'genres', 'owners']]
        
        agg_data = pd.DataFrame(columns=['genre','dates','populations'])
        # big FOR loop, for now
        
        agg_data = data.explode('genres')
        agg_data = agg_data.groupby("genres")
        #describe()
        agg_data = [group for _, group in agg_data]
        
        for x in agg_data:
            x.dropna(how='any', inplace=True)
            x.sort_values(by=['release_date'], ascending=[True], inplace=True)
            #x = x.groupby( [pd.Grouper(key='release_date', freq=str(interval)+"M"), pd.Grouper('genres')] ).agg({'owners': 'sum'})
        
        
    
        # remove excessive columns and sort values
    
        # data = data[['release_date', 'genres', 'owners']].sort_values(['release_date','genres', 'owners'], ascending=[True, True, False])
    
        # group by the time interval and get sum of the owners
    
        for i in range(0, len(agg_data)):  
            name = agg_data[i]['genres'].iloc[0]
            agg_data[i] = agg_data[i].groupby(pd.Grouper(key='release_date', freq=str(interval)+"M")).agg({'owners': 'sum'})
            agg_data[i] = agg_data[i].reset_index()
            dict_data[name] = agg_data[i]
            
            
        return dict_data
    
    
    # Resets Index of the merged dataframe
    def clean_index(data):
        return data.reset_index(drop=True)
    
    # Transforms owners column values from str of range of values into the average float value
    # @param: pandas dataframe with the full data, interval integer meaning the number of months
    # @return: pandas dataframe with owners column modified
    def replace_owner_str_with_average_number(data):
        def replace_letters(entry):
            to_remove = {" M": "000000", " k": "000"}
    
            for char in to_remove.keys():
                entry = entry.replace(char, to_remove[char])
    
            return entry
            
        data["owners"] = data["owners"].apply(lambda name: replace_letters(name))
        data["owners"] = data["owners"].apply(lambda name: re.findall("\d+",name))
        data["owners"] = data["owners"].apply(lambda name: [int(item) for item in name])
        data["owners"] = data["owners"].apply(lambda name: float(sum(name)/len(name)))
        return data
    
    
    # Encodes time as numbers for processing
    # @param: pandas dataframe with the full data
    # @return: pandas dataframe with time modified
    def encode_time(df):
        df['release_date']=df['release_date'].map(dt.datetime.toordinal)
        return df
    
    def lin_reg(df):
        
        y = np.asarray(df['owners'])
        X = df[['release_date']]
        #X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=.7,random_state=42)
        
        model = LinearRegression() #create linear regression object
        #model.fit(X_train, y_train) #train model on train data
        model.fit(X, y)
        #model.score(X_train, y_train) #check score
        return model
    
    # Plot the given genre data 
    
    Max Väistö's avatar
    Max Väistö committed
    def get_genre_plot(dict_data: object, genre: object) -> object:
        plt.scatter(dict_data[genre]["release_date"], dict_data[genre]["owners"])
    
        plt.show()
        
    def get_data_interval(days):
        base = dt.datetime.now()
        
        final_date = base + dt.timedelta(days=days)
    
        parts = list(pd.date_range(pd.Timestamp(base), pd.Timestamp(final_date), freq='2M')) 
    
        dates = [t.toordinal() for t in parts]
        return dates
        
    
    if __name__ == "__main__":
    
    
    Max Väistö's avatar
    Max Väistö committed
    
    
        full_data_df = read_data()
        full_data_df = clean_index(full_data_df)
    
        label_encoding(full_data_df)
        data = replace_owner_str_with_average_number(full_data_df)
        genre_data = genre_data_aggregation(full_data_df, 2)
    
    Max Väistö's avatar
    Max Väistö committed
        genre = "Free to Play"
    
        
        get_genre_plot(genre_data, genre)
        
        # 730 days = 2 years
        dates = np.array(get_data_interval(730))
        dates = dates.reshape(len(dates), 1)
        
        # GET ALL MODELS FOR ALL GENRES
        models = {}
        predictions = {}
        process_data = genre_data
        for x in process_data:  
            genre_data[x] = encode_time(genre_data[x])
            models[x] = lin_reg(genre_data[x])
            
        for genre in models:
            predictions[genre] = models[genre].predict(dates)
            
        # GET POINT OF REFERENCE