Skip to content
Snippets Groups Projects
main.py 7.22 KiB
Newer Older
Max Väistö's avatar
Max Väistö committed
import itertools
import datetime
from concurrent.futures import ThreadPoolExecutor
Max Väistö's avatar
Max Väistö committed

import requests
import json
import pandas
import time
import numpy
Max Väistö's avatar
Max Väistö committed
import matplotlib.pyplot as plt
from pandas.core.dtypes.common import is_numeric_dtype

STEAMSPY_ALL_GAMES_URL = "https://steamspy.com/api.php?request=all&page="
STEAM_GAME_INFO_URL = "https://store.steampowered.com/api/appdetails?appids="
STEAM_API_LANGUAGE = "&l=english"
STEAM_SPY_GAME_INFO = "https://steamspy.com/api.php?request=appdetails&appid="
Max Väistö's avatar
Max Väistö committed


# There are 67 pages of data, but for the heck of it
# we're going to try to load 100 pages.
def get_all_data(iterations: int = 100, num_threads: int = 4):
    """Download the SteamSpy game list and enrich it with per-game details.

    Pages ``0 .. iterations - 1`` are requested from ``STEAMSPY_ALL_GAMES_URL``.
    Paging stops at the first page that cannot be downloaded or parsed, or
    that contains no games. For every ``appid`` found, the Steam store API and
    the SteamSpy app-details API are then queried on a thread pool.

    Args:
        iterations: Maximum number of list pages to request.
        num_threads: Number of worker threads used for the per-game requests.

    Returns:
        A ``pandas.DataFrame`` with one row per game: the columns of the
        SteamSpy list followed by the columns produced by
        ``get_additional_game_data_steam`` and
        ``get_additional_game_data_steamspy``.

    Raises:
        RuntimeError: If not a single page could be loaded (previously this
            surfaced as an ``UnboundLocalError`` on ``df``).
    """
    def get_api_data_for_game_threaded(app_id):
        steam_api_data = get_additional_game_data_steam(str(app_id))
        steamspy_api_data = get_additional_game_data_steamspy(str(app_id))
        return steam_api_data, steamspy_api_data

    page_frames = []
    for page in range(iterations):
        print(page)
        url = STEAMSPY_ALL_GAMES_URL + str(page)
        try:
            # The timeout keeps a dead connection from hanging the run forever.
            response = json.loads(requests.get(url, timeout=60).text)
        except Exception as error:
            print(error)
            break
        # Anything that is not a non-empty mapping is treated as the end of
        # the listing.
        if not isinstance(response, dict) or not response:
            break
        page_frames.append(pandas.DataFrame(list(response.values())))

    if not page_frames:
        raise RuntimeError("No game data could be loaded from SteamSpy")

    # Concatenate once instead of re-copying the growing frame on every page.
    df = pandas.concat(page_frames, ignore_index=True, sort=False)

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        combined_results = list(executor.map(get_api_data_for_game_threaded, df["appid"]))

    steam_results = [result[0] for result in combined_results]
    steamspy_results = [result[1] for result in combined_results]

    # executor.map preserves input order, so the rows line up with df's
    # 0..n-1 index and can be joined column-wise.
    df = pandas.concat([df, pandas.DataFrame(steam_results), pandas.DataFrame(steamspy_results)], axis=1)
    return df


def get_additional_game_data_steam(id, max_fails: int = 10, base_wait: float = 60):
    """Fetch platforms, release date and categories for one app from Steam.

    The store API is polled every ``base_wait`` seconds until it reports
    ``success`` for the app. An attempt counts as a failure when the request
    or the JSON parsing raises, or when Steam answers with a non-empty body
    that does not report success for this app. After ``max_fails`` failures
    the function gives up.

    Args:
        id: Steam app id; it is converted to ``str`` for the URL and for the
            lookup key in the response.
        max_fails: Number of counted failures after which NaN values are
            returned.
        base_wait: Seconds to sleep between attempts.

    Returns:
        A ``pandas.Series`` with the keys ``platforms`` (list of enabled
        platform names), ``release_date`` and ``categories`` (list of category
        descriptions). All three are NaN when the function gives up.
    """
    app_id = str(id)
    url = STEAM_GAME_INFO_URL + app_id + STEAM_API_LANGUAGE
    fails = 0
    first_attempt = True

    while True:
        payload = None
        load_error = False
        try:
            # The timeout keeps a dead connection from blocking the worker.
            payload = json.loads(requests.get(url, timeout=60).text)
        except Exception as ex:
            print("Error occurred while parsing json from Steam:", ex)
            load_error = True
        if first_attempt:
            print(datetime.datetime.now())
            first_attempt = False

        entry = payload.get(app_id) if isinstance(payload, dict) else None
        if not load_error and isinstance(entry, dict) and entry.get("success"):
            break

        # This part is meant to catch errors in the loading process.
        # NOTE(review): empty/null bodies are retried without being counted,
        # as in the original code — presumably to wait out rate limiting;
        # confirm that an unbounded wait is intended.
        if load_error or payload:
            fails += 1
            if fails >= max_fails:
                return pandas.Series({'platforms': numpy.nan, 'release_date': numpy.nan, 'categories': numpy.nan})
        print("Failed queries for", app_id, "is", fails)
        time.sleep(base_wait)

    data = entry.get("data") or {}
    if data.get("type") != "game":
        print("Non game found", data.get("name"))
    platforms = [platform for (platform, enabled) in (data.get("platforms") or {}).items() if enabled]
    release_date = (data.get("release_date") or {}).get("date", numpy.nan)
    categories = [category_data["description"] for category_data in data.get("categories") or []]
    return pandas.Series({'platforms': platforms, 'release_date': release_date, 'categories': categories})


def get_additional_game_data_steamspy(id, max_fails: int = 10, retry_wait: float = 1):
    """Fetch concurrent users, languages, genres and tags for one app from SteamSpy.

    The app-details endpoint is queried until it returns a non-empty JSON
    object. Every attempt that raises or yields an empty/non-object body
    counts as a failure; after ``max_fails`` failures the function gives up.

    Args:
        id: Steam app id; it is converted to ``str`` for the URL.
        max_fails: Number of failed attempts after which NaN values are
            returned.
        retry_wait: Seconds to sleep between failed attempts.

    Returns:
        A ``pandas.Series`` with the keys ``ccu``, ``languages`` (list of
        language names), ``genres`` and ``tags`` (list of tag names). All four
        are NaN when the function gives up.
    """
    url = STEAM_SPY_GAME_INFO + str(id)
    fails = 0
    while True:
        response = None
        try:
            # The timeout keeps a dead connection from blocking the worker.
            response = json.loads(requests.get(url, timeout=60).text)
        except Exception as ex:
            print("Error occurred while parsing json from SteamSpy:", ex)
        if isinstance(response, dict) and response:
            break
        # Empty bodies are counted too; otherwise they would be retried
        # forever in a tight loop.
        fails += 1
        if fails >= max_fails:
            return pandas.Series({"ccu": numpy.nan, 'languages': numpy.nan, 'genres': numpy.nan, "tags": numpy.nan})
        time.sleep(retry_wait)

    raw_languages = response.get("languages")
    languages = raw_languages.split(", ") if raw_languages else []
    genres = response.get("genre", numpy.nan)
    ccu = response.get("ccu", numpy.nan)
    # Only a mapping of tag name -> value yields tag names; any other value
    # is treated as "no tags".
    raw_tags = response.get("tags")
    tags = list(raw_tags) if isinstance(raw_tags, dict) else []

    return pandas.Series(dict(ccu=ccu, languages=languages, genres=genres, tags=tags))


Max Väistö's avatar
Max Väistö committed
def add_user_rating(df):
    """Add a ``Review_rating`` column holding the share of positive reviews.

    The rating is ``positive / (positive + negative)`` per row, or ``0`` when
    a game has no reviews at all. The frame is modified in place and returned.
    """
    def rating_for_row(row):
        likes = row.positive
        dislikes = row.negative
        # No reviews at all: avoid dividing by zero.
        if likes == dislikes and dislikes == 0:
            return 0
        total = likes + dislikes
        return likes / total

    df["Review_rating"] = df.apply(rating_for_row, axis=1)
    return df


def create_hist_plots(df, output_dir: str = "images"):
    """Plot, save and show a log-scaled histogram for every numeric column.

    Each figure is saved as ``<output_dir>/<column> log histogram.png``.

    Args:
        df: Frame whose numeric columns are plotted.
        output_dir: Directory the PNG files are written to; it is created
            when it does not exist yet.
    """
    import os

    os.makedirs(output_dir, exist_ok=True)
    for col_name in df.columns:
        if not is_numeric_dtype(df[col_name]):
            continue
        fig = plt.figure()
        # Missing values are dropped before plotting.
        plt.hist(df[col_name].dropna(), log=True)
        title = f"{col_name} log histogram"
        plt.title(title)
        # os.path.join keeps the path portable (the old literal "images\\"
        # only worked as a directory separator on Windows).
        fig.savefig(os.path.join(output_dir, title + ".png"))
        plt.show()
        # Release the figure so that many columns do not pile up open figures.
        plt.close(fig)


def replace_owner_number_with_symbol(df):
    """Shorten the ``owners`` range strings by abbreviating round numbers.

    ``,000,000`` becomes `` M`` and any remaining ``,000`` becomes `` k``.
    The frame is modified in place and returned.
    """
    # Order matters: millions must be rewritten before thousands.
    abbreviations = ((",000,000", " M"), (",000", " k"))

    def abbreviate(owner_range: str):
        for zeros, symbol in abbreviations:
            owner_range = owner_range.replace(zeros, symbol)
        return owner_range

    df["owners"] = df["owners"].apply(abbreviate)
    return df


def create_heat_maps(df, plot_pairs):
    """Show one heat-map image per column pair in ``plot_pairs``.

    For each ``(x, y)`` pair the values of the two columns are drawn with
    ``imshow`` using the ``hot`` colour map, together with a colour bar.
    """
    for pair in plot_pairs:
        first_col, second_col = pair
        # Every pair gets its own figure.
        plt.figure()
        pair_values = df[[first_col, second_col]].values
        image = plt.imshow(pair_values, cmap='hot', interpolation='nearest', aspect='auto')
        plt.colorbar(image)
        plt.xlabel(first_col)
        plt.ylabel(second_col)
        plt.title(f"Heatmap of {first_col} vs {second_col}")
        plt.show()


def price_to_dollars(convert_df):
    """Convert the ``price`` column from cents to dollars.

    Values are parsed with ``int`` and divided by 100; a price of zero stays
    ``0``. Missing prices (``None``/NaN) become NaN instead of raising. The
    frame is modified in place and returned.
    """
    def cents_to_dollars(val):
        # int() cannot parse None/NaN, so missing prices are passed through.
        if pandas.isna(val):
            return numpy.nan
        cents = int(val)
        return cents / 100 if cents != 0 else 0

    convert_df["price"] = convert_df["price"].apply(cents_to_dollars)
    return convert_df


if __name__ == "__main__":
    # Load the first two SteamSpy pages and derive the extra columns.
    df = get_all_data(2)
    df = add_user_rating(df)
    df = replace_owner_number_with_symbol(df)
    df = price_to_dollars(df)
    df.to_csv("game_data_experimental.csv")

    # Summary tables; they are not printed, only kept for inspection
    # (e.g. in a debugger or an interactive session).
    summary_stats = df.describe()
    missing_counts = df.isna().sum()
    userscore_counts = df["userscore"].value_counts()

    # All pairs of numeric columns, in the form create_heat_maps expects.
    numeric_cols = [col for col in df.columns if is_numeric_dtype(df[col])]
    plot_pairs = list(itertools.combinations(numeric_cols, 2))

    print(df.columns)
    print("price", df["price"].unique())
    print("discount", df["discount"].unique())
    print("owners", df["owners"].unique())

    create_hist_plots(df)

    plt.hist(df["owners"], log=True)
    plt.xticks(rotation='vertical')
    plt.title("Histogram of game playerbase sizes with log scale")
    plt.tight_layout()
    plt.show()

    df.to_csv("game_data_full.csv")