Skip to content
Snippets Groups Projects
utils.py 2.89 KiB
Newer Older
  • Learn to ignore specific revisions
  • from typing import Sequence, Optional, Any
    
    import pandas
    
    DEFAULT_ILLEGAL_CONTINUATIONS = {"INC.", "LLC", "CO.", "LTD.", "S.R.O."}
    
    
    def get_owner_means(owner_limits: Sequence[Any]):
        """
        Returns the midpoint of an owner range.
        :param owner_limits: A list whose first two items are the lower and upper limit.
        :return: The mean of the first two items when a list is given. Any other
            value (including tuples and scalars) is returned unchanged.
        """
        if isinstance(owner_limits, list):
            lower, upper = owner_limits[0], owner_limits[1]
            return (lower + upper) / 2
        return owner_limits
    
    def convert_owners_to_limits(owner_limit):
        """
        Converts an owner range string into a list of integer limits.

        The string is split at " .. ", spaces are removed from every part, "M" is
        expanded to six zeros and "k" to three zeros, and the result is parsed with int().
        :param owner_limit: The range string. Non-string values are returned unchanged.
        :return: A list with one integer per part of the range.
        :raises ValueError: If a part is not an integer after the expansion.
        """
        if isinstance(owner_limit, str):
            limits = []
            for part in owner_limit.split(" .. "):
                compact = part.replace(" ", "")
                expanded = compact.replace("M", "000000").replace("k", "000")
                limits.append(int(expanded))
            return limits
        return owner_limit
    
    Max Väistö's avatar
    Max Väistö committed
    def split_companies(arr, illegal_continuations: Optional[Sequence[str]] = None):
        """
        Splits a comma separated company string into a list of company names.

        The string is split at every ", " unless the text right after it starts with
        one of the illegal continuations, in which case that comma is kept as part of
        the name (e.g. "Foo, Inc., Bar" gives ["Foo, Inc.", "Bar"]).
        :param arr: String containing the developers/publishers for a single game
        :param illegal_continuations: Tokens that must stay attached to the preceding
            name. Compared case-insensitively. Defaults to DEFAULT_ILLEGAL_CONTINUATIONS;
            an empty collection makes every ", " a split point.
        :return: The whitespace-stripped company names as a list (empty list for "").
        :note: If arr is a missing value (per pandas.isna), it is returned unchanged
            instead of a list.
        :note: Continuations are matched as plain prefixes, without any word-boundary
            check.
        """
        if illegal_continuations is None:
            illegal_continuations = DEFAULT_ILLEGAL_CONTINUATIONS
        if pandas.isna(arr):
            return arr

        separator = ", "
        # Normalise once per call so the comparison below is case-insensitive on both sides.
        blocked = {continuation.upper() for continuation in illegal_continuations}
        # Only a slice as long as some continuation can be equal to it, so these are
        # the only lengths worth testing. Empty when there are no continuations.
        candidate_lengths = sorted({len(continuation) for continuation in blocked})

        results_list = []
        start_index = 0
        comma_index = arr.find(separator)
        while comma_index != -1:
            after = comma_index + len(separator)
            stays_attached = any(
                arr[after:after + length].upper() in blocked
                for length in candidate_lengths
            )
            if not stays_attached:
                results_list.append(arr[start_index:comma_index].strip())
                start_index = after
            comma_index = arr.find(separator, after)

        if arr:
            # Whatever follows the last accepted split point is the final company.
            results_list.append(arr[start_index:].strip())
        return results_list
    
    
    
    def extract_unique_companies(nested_companies):
        """
        Collects the distinct company names from a collection of company lists.
        :param nested_companies: Iterable whose items are lists of company names.
            Items that are not lists (e.g. NaN for a missing value) are skipped.
        :return: A list of the unique company names in order of first appearance.
        :note: The names must be hashable, since a dict is used for de-duplication.
        """
        flattened = (
            company
            for company_list in nested_companies
            if isinstance(company_list, list)
            for company in company_list
        )
        # dict keys keep insertion order, which gives an order-preserving
        # de-duplication in linear time instead of a list scan per element.
        return list(dict.fromkeys(flattened))