Data Scraping and Analysis of Video Game Quality

By Abdallah Hwishel and Varun Lagadapati

Introduction

The video game industry has grown by 23% since last year. Its value now exceeds that of other forms of entertainment like the movie and television business, and it keeps growing. This surge of interest has put pressure on video game companies to release games on a regular schedule, with consumers seeing multiple releases from different studios every year. That pressure is a likely cause of the current state of games, where many launches are botched and titles ship in an extremely buggy or unpolished state. Games lately feel lackluster, and few game ideas seem original anymore, which is why we see companies sticking to what sells rather than taking risks on new ideas and game mechanics. As a result, game companies release fewer new IPs and instead focus on sequels to existing successful IPs, or on remasters and remakes of games that were popular in the past. Gamers hear news of botched releases so often that expectations for games launching in a polished state keep dropping. This raises big questions about the state of the gaming industry as a whole, which leads to the purpose of this notebook. Here we intend to answer those big questions about the industry and its current state using publicly available data like user reviews and scores for three of the largest game companies: Electronic Arts (EA), Ubisoft, and Activision. Along the way, this serves as a tutorial to teach readers about the 'data science pipeline' and its stages.

Data Science Pipeline for this tutorial:

  1. Data Collection - We will collect Metacritic scores and rating counts for games from different companies, to analyze and answer our questions
  2. Data Visualization - Visualize the data using plots and figures to get a general 'feel' for the data and possible trends
  3. Exploratory Data Analysis - 'Crunch the numbers' and see if we can learn something from it
  4. Hypothesis Testing - The 'science' part of data science where we hypothesize, test, and make conclusions
  5. Communication Of Insights - We will communicate what we learned in a readable format

NOTE: We assume that the Metacritic reviews are independent of each other, and we use the user score as our measure of a game's quality.

The Big Question

Has the quality of games made by major game companies dropped after 2014?

To answer that question, we first need to get some data; then we look for general trends and patterns, build some assumptions, and develop intuition. After that we analyze the data in a way that lets us answer specific questions, which lead to hypotheses. From there we test our hypotheses and draw conclusions. Finally, we communicate the findings in a coherent manner. The data we'll use to answer this big question, along with other related but smaller questions, are user review scores as well as rating counts. We'll use the user review score as a measurement of quality for any particular game that exists on Metacritic, and we'll use the number of ratings to gauge how popular a game is and whether there was any "hype" for it. To start, we'll use Wikipedia to get a list of games from the three companies listed above.

Note: The reason we use the wiki pages is that they list all the games these companies released across different periods of time, typically spanning 10 years each, in a nice tabular format. Below we pull the table of all Ubisoft games released between 2000 and 2009 as an example.

In [52]:
#IMPORTS - These are general imports we will use
import pandas as pd
import numpy as np
import matplotlib as mp
import re

#DEBUG CODE - workaround for local SSL certificate errors when pd.read_html fetches the
#Wikipedia pages; it disables certificate verification, so only use it if you hit SSLError locally
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

Part I: Data Collection

In [53]:
#PART I - Data Collection

#Wiki-links:
ubisoft_games = ['https://en.wikipedia.org/wiki/List_of_Ubisoft_games:_2000%E2%80%932009', 
                 'https://en.wikipedia.org/wiki/List_of_Ubisoft_games:_2010%E2%80%932019', 
                 'https://en.wikipedia.org/wiki/List_of_Ubisoft_games:_2020%E2%80%93present']
ea_games = ['https://en.wikipedia.org/wiki/List_of_Electronic_Arts_games:_2000%E2%80%932009',
            'https://en.wikipedia.org/wiki/List_of_Electronic_Arts_games:_2010%E2%80%93present']
activision_games = ['https://en.wikipedia.org/wiki/List_of_Activision_games:_2000%E2%80%932009',
                    'https://en.wikipedia.org/wiki/List_of_Activision_games:_2010%E2%80%932019',
                    'https://en.wikipedia.org/wiki/List_of_Activision_games:_2020%E2%80%93present']

#Company dataframes for different time periods, collected in a list.
#The index into pd.read_html's result selects the games table among the tables on each wiki page.
ubisoft_releases = [pd.read_html(ubisoft_games[0])[1], pd.read_html(ubisoft_games[1])[1], pd.read_html(ubisoft_games[2])[1]]
ea_releases = [pd.read_html(ea_games[0])[1], pd.read_html(ea_games[1])[1]]
activision_releases = [pd.read_html(activision_games[0])[0], pd.read_html(activision_games[1])[0], pd.read_html(activision_games[2])[0]]

#Example of the list of games ubisoft released between 2000 and 2009
ubisoft_releases[0].head()
Out[53]:
Title Platform(s) Release date Developer(s) Ref.
0 Papyrus Game Boy Color February 2000 Planet Interactive Development NaN
1 Theocracy Linux March 1, 2000 Philos Laboratories NaN
2 Theocracy Microsoft Windows March 1, 2000 Philos Laboratories NaN
3 Inspector Gadget: Operation Madkactus Game Boy Color March 14, 2000 RFX Interactive [1]
4 Rayman 2: The Great Escape Dreamcast March 21, 2000 Ubi Soft [2]

From the top of the table shown above, we can see several columns: the game titles, the platforms they were released on, their release dates, their developers, and a column called Ref. Some columns are more useful than others, like the title and release date columns, and some are of no use to us, like the Ref column. The Ref column contains hyperlinks used for navigation on the wiki page and provides no significance to our analysis. So we need to clean the dataframes to contain only information that is useful to us. The three main things we have to do are: filter the games to those released in 2005 and beyond, remove the Ref columns in all the dataframes, and finally combine all the games released by a particular company into one big dataframe.

In [54]:
#Join each company's dataframes of releases from different time periods into one
ubisoft_df = pd.concat(ubisoft_releases)
ea_df = pd.concat(ea_releases)
activision_df = pd.concat(activision_releases)

#First lets reset the index on the new dataframe because now we have entries that have the same index which can make our lives harder
ubisoft_df = ubisoft_df.reset_index(drop=True)
ea_df = ea_df.reset_index(drop=True)
activision_df = activision_df.reset_index(drop=True)

#Now let's remove all games where the release date is TBA because they're not released yet, so we have no use for them
ubisoft_df = ubisoft_df[ubisoft_df["Release date"] != 'TBA'][:-1] #The last row is excluded because it referred to an unreleased DLC
ea_df = ea_df[ea_df["Release date"] != 'TBA']
activision_df = activision_df[activision_df["Release date"] != 'TBA']

#Now we have to isolate the year from the release date. It's a string, so we slice out the last 4 characters and convert them to an int.
#Using the vectorized .str accessor avoids pandas' chained-assignment warnings from writing to the column one cell at a time.
ubisoft_df["Release date"] = ubisoft_df["Release date"].str[-4:].astype(int)
ea_df["Release date"] = ea_df["Release date"].str[-4:].astype(int)
activision_df["Release date"] = activision_df["Release date"].str[-4:].astype(int)

#Filter dates such that we're looking at games between 2005 and 2021
#This will remove any releases before 2005 as well as games that aren't released yet

#We'll make a function to do just that so it looks more elegant
#(named filter_years so we don't shadow Python's built-in filter)
def filter_years(df, col_name, start, end):
    filter_before_start = df[df[col_name] >= start]
    filter_after_end = filter_before_start[filter_before_start[col_name] <= end]
    return filter_after_end

#Give me the values between 2005 and 2021 for all the dataframes
ubisoft_df = filter_years(ubisoft_df, "Release date", 2005, 2021).sort_values(by="Release date").reset_index(drop=True)
ea_df = filter_years(ea_df, "Release date", 2005, 2021).sort_values(by="Release date").reset_index(drop=True)
activision_df = filter_years(activision_df, "Release date", 2005, 2021).sort_values(by="Release date").reset_index(drop=True)

#Finally let's remove that useless Ref column
ubisoft_df = ubisoft_df.drop(columns="Ref.")
ea_df = ea_df.drop(columns="Ref(s)") #In the EA dataframe the Ref column is labeled as Ref(s) not Ref.
activision_df = activision_df.drop(columns="Ref.")

ubisoft_df.head()
Out[54]:
Title Platform(s) Release date Developer(s)
0 Tork: Prehistoric Punk Xbox 2005 Tiwak
1 Myst V: End of Ages Microsoft Windows 2005 Cyan Worlds
2 Marathon Manager Microsoft Windows 2005 Geronimo Entertainment
3 Myst V: End of Ages Mac OS 2005 Beenox
4 Far Cry Instincts Xbox 2005 Ubisoft Montreal
In [55]:
ubisoft_df.tail()
Out[55]:
Title Platform(s) Release date Developer(s)
1232 Far Cry 6 Xbox Series X/S 2021 Ubisoft Toronto / Ubisoft Berlin / Ubisoft Kyi...
1233 Discovery Tour: Viking Age Microsoft Windows 2021 Ubisoft Montreal
1234 Discovery Tour: Viking Age PlayStation 4 2021 Ubisoft Montreal
1235 Far Cry 6 PlayStation 4 2021 Ubisoft Toronto / Ubisoft Berlin / Ubisoft Kyi...
1236 Monopoly Madness Xbox One 2021 Engine Software

Our dataframes are now cleaned up, but there is one last problem to deal with before we can get the Metacritic scores for these lists of games. As you can see from the tables above, some games are repeated multiple times because they exist on different gaming platforms, and the wiki tables account for every single occurrence. We need a way to merge those platforms into one row per game, with the platforms comma-separated.

In [56]:
#Lets start with getting a list of games without duplicates for each company
ubisoft_titles = ubisoft_df['Title'].drop_duplicates()
ea_titles = ea_df['Title'].drop_duplicates()
activision_titles = activision_df['Title'].drop_duplicates()

#Let's make a function that will merge all platforms into one entry so that each game has a single row
def merge_platforms(games, df, platform_string): #The platform_string is needed because each dataframe's platforms column name is slightly different
    rows = [] #We'll put row values in here
    for game in games:
        game_entries = df[df['Title'] == game]
        platforms_list = list(game_entries[platform_string])
        #Map platform names to Metacritic's URL slugs; mark mobile platforms and
        #Amazon Luna/Stadia for removal (Stadia flopped and Luna is an unreleased "platform")
        for i,platform in enumerate(platforms_list):
            if platform == 'Nintendo Switch':
                platforms_list[i] = 'switch'
            elif platform == 'Amazon Luna' or platform == 'Stadia' or platform == 'Mobile phones' or platform == 'iOS' or platform == 'Android' or platform == 'N-Gage':
                platforms_list[i] = "-" #Mark it for removal; we can't delete entries while iterating over the list
            elif platform == 'PlayStation Portable':
                platforms_list[i] = 'psp'
            elif platform == 'Nintendo 3DS':
                platforms_list[i] = '3ds'
            elif platform == 'Nintendo DS':
                platforms_list[i] = 'ds'
            elif platform == 'Xbox Series X/S':
                platforms_list[i] = 'xbox-series-x'
            elif platform == 'Microsoft Windows' or platform == 'Windows':
                platforms_list[i] = 'pc'
        #Drop every marked entry (list.remove would only drop the first occurrence)
        platforms_list = [p for p in platforms_list if p != '-']
        platforms_joined = ','.join(platforms_list).lower().replace(' ', '-')
        release_date = game_entries.iloc[0]['Release date']
        developers = game_entries.iloc[0]['Developer(s)']
        new_entry = [game, platforms_joined, release_date, developers]
        rows.append(new_entry)
    return pd.DataFrame(data=rows, columns=['titles', 'platforms', 'release_date', 'developers'])

updated_ubisoft_df = merge_platforms(ubisoft_titles, ubisoft_df, 'Platform(s)')
updated_ea_df = merge_platforms(ea_titles, ea_df, 'Platforms')
updated_ea_df = updated_ea_df[updated_ea_df['platforms'] != ''] #Games whose only platforms were mobile end up with an empty platform string, so drop them
updated_activision_df = merge_platforms(activision_titles, activision_df, 'Platform(s)')
updated_activision_df = updated_activision_df[updated_activision_df['platforms'] != ''] #Same cleanup; this catches oddities like N-Gage exclusives

We now have the platforms listed in a way that's useful to us programmatically. The reason the code lower-cases the strings and replaces whitespace with the '-' character is that we're formatting the dataframe so it's immediately usable for requests to Metacritic's website. An example URL for Battlefield V's Metacritic page for the PlayStation 4 looks like this: https://www.metacritic.com/game/playstation-4/battlefield-v

Some things to note about the URL:

  1. The base url seems to be https://www.metacritic.com/game/ -> Example: https://www.metacritic.com/game/playstation-4/battlefield-v , https://www.metacritic.com/game/playstation-2/spider-man-2
  2. platform names have special formatting. Ex: PlayStation 4 = playstation-4, Nintendo Switch = switch, etc.
  3. The game name seems to have special formatting as well.
    • Sometimes games might have special characters like ':' or ','. An example would be FINAL FANTASY XV: WINDOWS EDITION -> final-fantasy-xv-windows-edition

So we need to account for that last point. Let's make a function that converts a game's name to its URL-friendly version.

In [57]:
#This function takes the original game's name and converts it to the metacritic url format
def convert_game_name(game):
    game = ''.join(game.strip().split('[')[:1]) #drop anything from the first '[' onward (e.g. wiki footnote markers)
    splitName = re.split(r"\s|: |-|, |&| – | - ", game) #raw string so the \s escape reaches the regex engine
    splitName = [elem for elem in splitName if elem != ''] #remove any remaining empty strings
    if '–' in splitName:
        splitName.remove('–')
    url_converted_name = '-'.join(splitName).lower().replace("'", "") #remove single quote chars
    if '–' in url_converted_name: #check again because sometimes an en dash survives inside a token, leaving '---' in the url
        url_converted_name = url_converted_name.replace('–', "-")
    if '.' in url_converted_name:
        url_converted_name = url_converted_name.replace('.','')
    return url_converted_name

#lets add a new column to the dataframes to contain the converted names so that we have both in the same dataframe.
updated_ubisoft_df["converted_name"] = [convert_game_name(name) for name in list(updated_ubisoft_df["titles"])]
updated_ea_df["converted_name"] = [convert_game_name(name) for name in list(updated_ea_df["titles"])]
updated_activision_df["converted_name"] = [convert_game_name(name) for name in list(updated_activision_df["titles"])]

updated_ubisoft_df.head()
Out[57]:
titles platforms release_date developers converted_name
0 Tork: Prehistoric Punk xbox 2005 Tiwak tork-prehistoric-punk
1 Myst V: End of Ages pc,mac-os 2005 Cyan Worlds myst-v-end-of-ages
2 Marathon Manager pc 2005 Geronimo Entertainment marathon-manager
3 Far Cry Instincts xbox 2005 Ubisoft Montreal far-cry-instincts
4 Lunar: Dragon Song ds 2005 Japan Art Media lunar-dragon-song

Now we're ready to make some requests to the Metacritic site. We'll use the requests library to get the HTML for each Metacritic page, and then scrape the data we care about, like user ratings, with a library called BeautifulSoup. First, we need to import both libraries.

In [58]:
import requests as r
from bs4 import BeautifulSoup as BS

We'll need to familiarize ourselves with BeautifulSoup's documentation to be able to use it. Feel free to consult it while you look through the code to see what the function calls do.

In [59]:
#Let's start by making a simple request to metacritic and familiarize ourselves with how to use BeautifulSoup to extract the user review scores.

#We'll start by looking through the documentation. Let's pull something from this metacritic url: https://www.metacritic.com/game/playstation-4/injustice-2

#We'll put this in a try/except block because the request could raise an error if it's blocked or times out
try:
    injustice2_req_response = r.get("https://www.metacritic.com/game/playstation-4/injustice-2")
except:
    print("Request timed out")
Request timed out

We've run into a common roadblock with requests: the request times out or gets blocked by the server. This happens occasionally when the server receives a request that carries no information with it, like where it came from, what kind of response it expects back, etc. That kind of information is carried in what are known as request headers. If a request arrives with no headers, a server with security measures may block it to protect itself from being flooded with requests by bots, and in its current state our request might as well look like a bot request. So let's make some headers to send with the request. Thankfully we don't have to guess what kind of information to include, because our browsers typically do that for us. Put the URL above into any browser you wish, load the page, and open the browser's inspect menu. In my case I'm using Google Chrome, so I'll be showing steps that are relative to Chrome. Please follow the steps below to arrive at the request headers.


Step 1: Go to the url of interest

Step 2: Right click anywhere on the page and click on inspect. This will open the chrome dev tools that gives us info about the page and network stats.

Step 3: You'll notice a window pops up, either as part of the browser or as a separate window. That's the Chrome dev tools. Find the Network tab and click it.

Step 4: You will see a busy tab with a bunch of things on it. We're interested in the lower half, where there's a table with several columns. The column called Name is the one we want. You might find that it's either empty or has only a few entries. Refresh the page while the dev tools are open so that more entries appear in the tab.

Step 5: The "things" mentioned in step 4 are basically elements of the page that had to travel through the network to be a part of the page like images, the web page itself, some scripts related google ads and what not. We want to find the webpage itself in the Name column. It should have the same name as the game itself. So since we're looking at Injustice 2, it will be called injustice-2. This isn't a convention or anything but that's typically how the webpages are named based on observation from other metacritic pages using the same steps here. Once you locate it, click on it to view information about it. A panel on its right should appear.

Step 6: Scroll down the panel on the right until you find a collapsible section called "Request Headers". This is what we will copy values from into our own request headers, so that the request our script makes resembles the browser's. See the code block below for the headers I chose to include.

In [60]:
#Copy only these ones from the headers, as the rest is already filled in by the requests library
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36"
}

try:
    injustice2_req_response = r.get("https://www.metacritic.com/game/playstation-4/injustice-2", headers=headers)
    print("Request Status Code:",injustice2_req_response.status_code, " -> which = SUCCESS")
except:
    print("Request timed out")
Request Status Code: 200  -> which = SUCCESS

Now that we've passed in headers that make the request look less suspicious, the server processes it and sends back a response. From this response we can retrieve the HTML for the page, from which we will extract our review scores and rating counts. That's where BeautifulSoup comes in.

In [61]:
injustice2_page_content = injustice2_req_response.content #If I print this it will look really ugly like so

print("HTML: \n")
injustice2_page_content[:500] #Showing first 500 characters in the page
HTML: 

Out[61]:
b'<!DOCTYPE html>\n<html lang="en">\n<head>\n        \t<title>Injustice 2 for PlayStation 4 Reviews - Metacritic</title>\n\t\n    \n    <meta http-equiv="content-type" content="text/html; charset=UTF-8">\n\n    \n    <meta name="description" content="Metacritic Game Reviews, Injustice 2 for PlayStation 4, Every battle defines you. Power up and build the ultimate version of your favorite DC legends in INJUSTICE 2. With a massive selection of...">\n\n    \n    \n    <meta name="viewport" content="width=1024">\n\n   '

This is really ugly and hard to read. Luckily, we don't have to read the raw HTML to find our tags of interest (the scores and rating counts). We can use the Chrome dev tools to find them for us, following the same inspect-element steps described above.

In [110]:
soup = BS(injustice2_page_content, 'html.parser')

str(soup.prettify())[:1000]
Out[110]:
'<!DOCTYPE html>\n<html lang="en">\n <head>\n  <title>\n   Injustice 2 for PlayStation 4 Reviews - Metacritic\n  </title>\n  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>\n  <meta content="Metacritic Game Reviews, Injustice 2 for PlayStation 4, Every battle defines you. Power up and build the ultimate version of your favorite DC legends in INJUSTICE 2. With a massive selection of..." name="description"/>\n  <meta content="width=1024" name="viewport"/>\n  <meta content="Metacritic" name="application-name"/>\n  <meta content="#000000" name="msapplication-TileColor"/>\n  <meta content="/images/win8tile/76bf1426-2886-4b87-ae1c-06424b6bb8a2.png" name="msapplication-TileImage"/>\n  <meta content="618k3mbeki8tar7u6wvrum5lxs5cka" name="facebook-domain-verification">\n   <meta content="Injustice 2" property="og:title"/>\n   <meta content="game" property="og:type"/>\n   <meta content="https://www.metacritic.com/game/playstation-4/injustice-2" property="og:url"/>\n   <meta content="https://'

This is how BeautifulSoup views our web page content: a parsed tree that makes it easy to extract/scrape data. Let's get our user scores from the page.

There is a very handy Chrome extension called SelectorGadget that extracts the relevant CSS selectors for a particular element on the page, which helps identify elements without much trouble.
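For example, once SelectorGadget gives you a CSS selector, you can pass it to BeautifulSoup's select_one instead of find. A minimal sketch; the selector string below is an assumption for illustration, so use SelectorGadget on the live page to get the real one:

In [ ]:
#Hypothetical: '.metascore_w.user.large' stands in for whatever selector SelectorGadget reports
score_tag = soup.select_one(".metascore_w.user.large")
if score_tag is not None:
    print("User score via CSS selector:", score_tag.text)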

In [63]:
#use soup.find(id="x") or soup.find(class_="y") to get the element of interest.
injustice2_user_score = soup.find(class_="large").text
rating_amount = soup.find('a', href="/game/playstation-4/injustice-2/user-reviews", class_=None).text.split(" ")[0]
print(f"The injustice 2 user score is {injustice2_user_score} on the playstation 4 with {rating_amount} ratings")
The injustice 2 user score is 8.0 on the playstation 4 with 794 ratings

Okay, now we know what code to write to extract the user scores and rating counts for ALL the games in the dataframes above. We have to be cautious though: some of the games don't have user scores at all, and some have a user score for only a subset of their platforms. We need to handle those cases to avoid errors. Metacritic stores its reviews with a separate page per game per platform, so for every platform a game is on, there is a page for it. Let's define a function that collects all of this for us.

In [64]:
def create_url_from_game(name,platform):
    #end result should look like this: https://www.metacritic.com/game/playstation-4/injustice-2
    base_url = "https://www.metacritic.com/game/"
    url_name = f"{base_url}{platform}/{name}"
    return url_name

def collect_user_scores_for_games_list(df):
    scores_and_ratings = [] #one row per (game, platform) pair
    for idx, (titles,platforms,release_date,developers,converted_name) in df.iterrows():
        for platform in platforms.split(","):
            url = create_url_from_game(converted_name, platform)
            try:
                response = r.get(url,headers=headers)
                html = response.content
                soup = BS(html,'html.parser')
                user_score = soup.find(class_="large").text
                #url[26:] strips the "https://www.metacritic.com" prefix, leaving the path used in the user-reviews href
                rating_amount = soup.find('a', href=f"{url[26:]}/user-reviews", class_=None).text.split(" ")[0]
                scores_and_ratings.append([titles, platform, user_score, rating_amount, release_date])
            except:
                #this also catches pages that don't exist (games with no metacritic entry), not just timeouts
                print(f"request for {url} timed out")
    return pd.DataFrame(data=scores_and_ratings, columns=['title', 'platform', 'score', 'ratings', 'year'])

The commented-out code below takes a while to run, so we ran each call once in its own cell and saved the results to CSV files; that way we don't have to repeat the requests every time we run the notebook. By our estimates, fetching all the data takes about an hour in total.
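One common way to structure this is a small load-or-scrape helper that only hits the network when the CSV cache doesn't exist yet. This is a sketch of that pattern, not the exact code we ran; the file name in the usage example mirrors the ones used below:

In [ ]:
import os

def load_or_scrape(csv_path, games_df):
    #Reuse a previous run's results if the cache file exists
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path)
    #Otherwise scrape now and save the results for next time
    result = collect_user_scores_for_games_list(games_df)
    result.to_csv(csv_path, index=False)
    return result

#Example usage:
#activision_scores_and_ratings = load_or_scrape('./activision_scores_and_ratings.csv', updated_activision_df)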

In [65]:
#I left this commented out because in total this takes a long time to run and I've saved the outcome of it in a csv file from a previous run
#If you want to try it out and see if it works you can uncomment it and let it run

# activision_scores_and_ratings = collect_user_scores_for_games_list(updated_activision_df)
In [66]:
#Code that I used from the first run to make a csv file out of the dataframe.

#Show the dataframe and save it to a csv file so we don't have to run these requests again
# activision_scores_and_ratings.to_csv('./activision_scores_and_ratings.csv', index=False)
# activision_scores_and_ratings
In [67]:
#I left this commented out because in total this takes a long time to run and I've saved the outcome of it in a csv file from a previous run
#If you want to try it out and see if it works you can uncomment it and let it run

# ubisoft_scores_and_ratings = collect_user_scores_for_games_list(updated_ubisoft_df)
In [68]:
#Code that I used from the first run to make a csv file out of the dataframe.

# ubisoft_scores_and_ratings.to_csv('./ubisoft_scores_and_ratings.csv', index=False)
# ubisoft_scores_and_ratings
In [69]:
# ea_scores_and_ratings = collect_user_scores_for_games_list(updated_ea_df)
In [70]:
#Code that I used from the first run to make a csv file out of the dataframe.

# ea_scores_and_ratings.to_csv('./ea_scores_and_ratings.csv', index=False)
# ea_scores_and_ratings
In [71]:
#Reassign the scores_and_ratings variables to use the csv files instead because we'll leave the code that sends requests commented out
#for the sake of saving time when running the notebook

ubisoft_scores_and_ratings = pd.read_csv('./ubisoft_scores_and_ratings.csv')
ubisoft_scores_and_ratings = ubisoft_scores_and_ratings[ubisoft_scores_and_ratings['score'] != 'tbd'] #metacritic shows 'tbd' when there are too few ratings to compute a score

ea_scores_and_ratings = pd.read_csv('./ea_scores_and_ratings.csv')
ea_scores_and_ratings = ea_scores_and_ratings[ea_scores_and_ratings['score'] != 'tbd']

activision_scores_and_ratings = pd.read_csv('./activision_scores_and_ratings.csv')
activision_scores_and_ratings = activision_scores_and_ratings[activision_scores_and_ratings['score'] != 'tbd']

ubisoft_scores_and_ratings['score'] = ubisoft_scores_and_ratings['score'].astype(float)
ea_scores_and_ratings['score'] = ea_scores_and_ratings['score'].astype(float)
activision_scores_and_ratings['score'] = activision_scores_and_ratings['score'].astype(float)

combined_scores_and_ratings = pd.concat([ubisoft_scores_and_ratings, ea_scores_and_ratings, activision_scores_and_ratings])


print(f"We started with this many games: {len(ubisoft_titles) + len(ea_titles) + len(activision_titles)}, but ended with this many games: {len(combined_scores_and_ratings['title'].drop_duplicates())}")
combined_scores_and_ratings
We started with this many games: 1439, but ended with this many games: 909
Out[71]:
title platform score ratings year
0 Tork: Prehistoric Punk xbox 6.2 6 2005
1 Myst V: End of Ages pc 7.6 58 2005
2 Far Cry Instincts xbox 6.0 82 2005
3 Lunar: Dragon Song ds 5.3 20 2005
4 Tom Clancy's Rainbow Six: Lockdown gamecube 7.8 5 2005
... ... ... ... ... ...
704 Call of Duty: Vanguard playstation-5 3.5 1307 2021
705 Call of Duty: Vanguard playstation-4 4.1 148 2021
706 Call of Duty: Vanguard pc 3.2 644 2021
707 Call of Duty: Vanguard xbox-one 4.0 115 2021
708 Call of Duty: Vanguard xbox-series-x 4.7 232 2021

2314 rows × 5 columns

In [72]:
#Let's make a dataframe where we average the scores for every game across its platforms

def average_game_scores(df):
    rows = []
    titles = list(df['title'].drop_duplicates())
    for title in titles:
        sub_df = df[df['title'] == title]
        mean_score = round(np.mean(list(sub_df['score'])),2)
        total_rating = sum([int(elem) for elem in list(sub_df['ratings'])]) #sum the rating counts across all of the game's platforms
        rows.append([title, ','.join(list(sub_df['platform'])), mean_score, total_rating, sub_df.iloc[0]['year']])
    return pd.DataFrame(data=rows, columns=['title', 'platform', 'score', 'ratings', 'year'])

ubisoft_avg_scores_and_ratings = average_game_scores(ubisoft_scores_and_ratings)
ea_avg_scores_and_ratings = average_game_scores(ea_scores_and_ratings)
activision_avg_scores_and_ratings = average_game_scores(activision_scores_and_ratings)
combined_avg_scores_and_ratings = pd.concat([ubisoft_avg_scores_and_ratings, ea_avg_scores_and_ratings, activision_avg_scores_and_ratings])

We can see that while filtering the dataframes and fetching the data, we've lost some games in the process. This is what's known in data science as missing values. When you have missing values, there are various methods of "imputation" depending on why the values are missing. In our case, the games that did not make it into the final combined dataframe are missing due to:

  1. Differences in the naming of games between the Wikipedia pages and Metacritic. Ex: Need For Speed: Most Wanted [how it is on wiki] vs Need For Speed: Most Wanted (2005) [how it is on Metacritic]. Since we can't comb through a thousand games and reconcile the name differences by hand, this data is simply lost.
  2. Some platforms listed for games on Wikipedia don't have a Metacritic page, so we can't get any data on them.
  3. A lot of the excluded games are either mobile games or on platforms that didn't exist until late in the study period, like Google's Stadia (launched in 2019) and Amazon Luna (announced but not yet launched).

Verdict: This data is "Missing Not At Random", because whether a game's score and rating are missing is related to the reasons listed above. You can read more about missing data and the methods for handling it; the way we chose to handle it was simply to exclude those games from the dataset.
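To get a feel for how much data went missing per company, a quick check like the following (a sketch using the dataframes defined above) compares the titles we started with against the titles that survived the scrape:

In [ ]:
#Count how many unique titles per company ended up with usable scores
for name, titles, scraped in [("Ubisoft", ubisoft_titles, ubisoft_scores_and_ratings),
                              ("EA", ea_titles, ea_scores_and_ratings),
                              ("Activision", activision_titles, activision_scores_and_ratings)]:
    print(f"{name}: {scraped['title'].nunique()} of {len(titles)} titles have usable scores")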

Part II + III: Data Visualization & Exploratory Data Analysis

Alright, now that we finally have our data, let's start making plots and see what the data looks like as a whole, per company, and during different periods of time.

For this section we'll rely heavily on a plotting library called Matplotlib. It's a popular Python library for data visualization thanks to its rich plotting functionality, which covers scatter plots, box plots, and more. Other libraries like seaborn achieve similar results, but we chose Matplotlib simply because we're more familiar with it.

In [73]:
#PART II: Data Visualization

#import library we'll use to make plots.
import matplotlib.pyplot as plt

Now that we have the matplotlib library imported, let's start by making plots of scores and ratings for each company.

In [74]:
#Ubisoft Plot

years = list(combined_scores_and_ratings['year'].drop_duplicates())

scores_per_year = [list(ubisoft_avg_scores_and_ratings[ubisoft_avg_scores_and_ratings['year'] == year]['score']) for year in years]

fig, ubisoft_score_plot = plt.subplots(figsize=(20,10))
ubisoft_score_plot.set_xlabel("Years")
ubisoft_score_plot.set_ylabel("Scores")
ubisoft_score_plot.set_yticks([i for i in range(0,11)])
ubisoft_score_plot.set_title("Ubisoft Average Game Scores For Years 2005-2021")
ubisoft_score_plot.boxplot(scores_per_year, labels=years) #labels= puts the years on the x axis and avoids the FixedFormatter warning
plt.show()
In [75]:
print(f"Ubisoft's Lowest Score: {np.min(list(ubisoft_avg_scores_and_ratings['score']))} for {ubisoft_avg_scores_and_ratings[ubisoft_avg_scores_and_ratings['score'] == 2.6].iat[0,0]}")
print(f"Ubisoft's Highest Score: {np.max(list(ubisoft_avg_scores_and_ratings['score']))} for {ubisoft_avg_scores_and_ratings[ubisoft_avg_scores_and_ratings['score'] == 8.8].iat[0,0]}")
Ubisoft's Lowest Score: 2.6 for ABBA: You Can Dance
Ubisoft's Highest Score: 8.8 for No More Heroes 2: Desperate Struggle

From the graph above, we can see that the median score (the yellow line) for each year is not uniform; the medians float between scores of 5 and 8. We also see different spreads for each year: the narrower the spread, the more concentrated the data points around a particular set of scores. For example, the spread for 2005 shows that the majority of games released that year scored between 7 and 8. The top and bottom whiskers represent the max and min scores respectively, and the circles beyond those whiskers are outliers, data points so extreme relative to the rest of the data that they can't be matched to the larger trend. Per the code output above, the lowest-scoring game Ubisoft released between 2005 and 2021 was ABBA: You Can Dance, while the highest-scoring one was No More Heroes 2: Desperate Struggle. Overall, we see a slow yet steady decreasing trend in the value of the medians.
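If you'd rather read the medians numerically than eyeball the yellow lines, a one-line groupby (a quick sketch on the dataframe above) prints them per year:

In [ ]:
#Median average-score per year, to back up the visual read of the box plot
print(ubisoft_avg_scores_and_ratings.groupby('year')['score'].median())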

In [76]:
#EA Plot
scores_per_year = [list(ea_avg_scores_and_ratings[ea_avg_scores_and_ratings['year'] == year]['score']) for year in years]

fig, ea_score_plot = plt.subplots(figsize=(20,10))
ea_score_plot.set_xlabel("Years")
ea_score_plot.set_ylabel("Scores")
ea_score_plot.set_yticks([i for i in range(0,11)])
ea_score_plot.set_title("EA Average Game Scores For Years 2005-2021")
ea_score_plot.boxplot(scores_per_year, labels=years)
plt.show()

EA's medians seem to be steadily decreasing overall, with a few increases here and there, while the spread keeps getting larger. EA also appears to have released a lot more games in the last two years of the period compared to the rest of it. The same behavior shows in the min and max values of the plot. This points to a steady decrease in the quality of EA games.

In [77]:
#Activision Plot
scores_per_year = [list(activision_avg_scores_and_ratings[activision_avg_scores_and_ratings['year'] == year]['score']) for year in years]

fig, activision_score_plot = plt.subplots(figsize=(20,10))
activision_score_plot.set_xlabel("Years")
activision_score_plot.set_ylabel("Scores")
activision_score_plot.set_yticks([i for i in range(0,11)])
activision_score_plot.set_title("Activision Game Scores For Years 2005-2021")
activision_score_plot.boxplot(scores_per_year, labels=years)
plt.show()

Activision's dataset is a bit harder to read, as the medians shift up and down all over the place. However, we can see a steady decrease until 2013, where the trend reaches an inflection point and starts rising. Unlike the previous two companies, Activision did not release as many games, especially in the final year of the period.

Note: Activision only released 4 games in 2021, which is why that box plot looks odd.

In [78]:
#Combined Plot
scores_per_year = [list(combined_avg_scores_and_ratings[combined_avg_scores_and_ratings['year'] == year]['score']) for year in years]

fig, combined_score_plot = plt.subplots(figsize=(20,10))
combined_score_plot.set_xlabel("Years")
combined_score_plot.set_ylabel("Scores")
combined_score_plot.set_yticks([i for i in range(0,11)])
combined_score_plot.set_title("Combined Game Scores For Years 2005-2021")
combined_score_plot.boxplot(scores_per_year, labels=years)
plt.show()

The culmination of all three companies' data across a 17-year period shows a slow but steady decrease in Metacritic scores, which hints that the quality of games has been dropping overall. We can examine this more closely by fitting a regression line to the period before 2014, the period from 2014 on, and both combined.

In [79]:
#Before 2014

before_2014 = combined_avg_scores_and_ratings[combined_avg_scores_and_ratings['year'] < 2014]

fig, combined_score_plot = plt.subplots(figsize=(20,10))
combined_score_plot.set_xlabel("Years")
combined_score_plot.set_ylabel("Scores")
combined_score_plot.set_yticks([i for i in range(0,11)])
combined_score_plot.set_title("Combined Game Scores For Years 2005-2013")
combined_score_plot.scatter(x=before_2014['year'], y=before_2014['score'])
m, b = np.polyfit(before_2014['year'], before_2014['score'], 1) #degree-1 fit, i.e. a straight regression line
plt.plot(before_2014['year'], m*np.array(before_2014['year'])+b)

print(f"Slope of regression line: {m}")
Slope of regression line: -0.18244342019297186

For all the game scores from 2013 and earlier (before 2014), there was already a decreasing trend in user scores, which suggests a decline in game quality as the years progressed.

In [96]:
#After 2014

from_2014 = combined_avg_scores_and_ratings[combined_avg_scores_and_ratings['year'] >= 2014]

fig, combined_score_plot = plt.subplots(figsize=(20,10))
combined_score_plot.set_xlabel("Years")
combined_score_plot.set_ylabel("Scores")
combined_score_plot.set_yticks([i for i in range(0,11)])
combined_score_plot.set_title("Combined Game Scores For Years 2014-2021")
combined_score_plot.scatter(x=from_2014['year'], y=from_2014['score'])
m, b = np.polyfit(from_2014['year'], from_2014['score'], 1)
plt.plot(from_2014['year'], m*np.array(from_2014['year'])+b)

print(f"Slope of regression line: {m}")
Slope of regression line: -0.1612871491702159

For the games released in 2014 and after, we see a similar decreasing trend in user scores as the years progress, again suggesting a decline in quality. Surprisingly, the slope is less severely negative for 2014-2021, meaning the average score per game declines less steeply than before 2014, but the slope remains negative nonetheless.
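To put the two slopes side by side without repeating the plotting code, a small helper (a sketch reusing np.polyfit as above) can compute the regression slope for any year range:

In [ ]:
#Regression slope of score vs. year over an inclusive year range
def yearly_slope(df, start, end):
    sub = df[(df['year'] >= start) & (df['year'] <= end)]
    m, _ = np.polyfit(sub['year'], sub['score'], 1)
    return m

print("2005-2013 slope:", yearly_slope(combined_avg_scores_and_ratings, 2005, 2013))
print("2014-2021 slope:", yearly_slope(combined_avg_scores_and_ratings, 2014, 2021))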

In [81]:
#The combined averages plot

fig, combined_score_plot = plt.subplots(figsize=(20,10))

combined_score_plot.set_xlabel("Years")
combined_score_plot.set_ylabel("Scores")
combined_score_plot.set_yticks([i for i in range(0,11)])
combined_score_plot.set_title("Combined Game Scores For Years 2005-2021")
combined_score_plot.scatter(x=list(combined_avg_scores_and_ratings['year']), y=list(combined_avg_scores_and_ratings['score']))
m, b = np.polyfit(list(combined_avg_scores_and_ratings['year']), list(combined_avg_scores_and_ratings['score']), 1)
plt.plot(list(combined_avg_scores_and_ratings['year']), m*np.array(list(combined_avg_scores_and_ratings['year']))+b)

print(f"Slope of regression line: {m}")
Slope of regression line: -0.14786041536796318

When everything is combined, the regression line's slope (about -0.148) ends up slightly less negative than either sub-period's slope, since a single line has to fit both periods at once. Regardless, it's clear that the overall trend in user scores is a decreasing one, which shows a general drop in the quality of games between 2005 and 2021.

How did the consoles generally perform compared to one another?

In [82]:
#Console performance based on rating

platforms = list(combined_scores_and_ratings['platform'].drop_duplicates())
platform_avg_scores = []

for platform in platforms:
    platform_score_mean = np.mean(list(combined_scores_and_ratings[combined_scores_and_ratings['platform'] == platform]['score']))
    platform_avg_scores.append([platform, round(platform_score_mean,2)])

platform_avg_scores_df = pd.DataFrame(data=platform_avg_scores, columns=['platform', 'avg_score'])
platform_avg_scores_df = platform_avg_scores_df.sort_values(by='avg_score', ascending=False)

#Abbreviated console labels, in the same descending-score order as the sorted dataframe
platform_labels = ['GC', 'PSP', 'PS2', 'Xbox', 'GBA', 'DS', 'Switch', 'X360', 'Wii', 'PS3', 'PC', 'Wii-U', 'PSVita', 'XBONE', 'PS4', 'XSX', 'PS5', '3DS']

fig, console_plot = plt.subplots(figsize=(20,10))
console_plot.set_xlabel("Platforms")
console_plot.set_ylabel("Scores")
console_plot.set_yticks([i for i in range(0,11)])
console_plot.set_title("Platform Average Scores")
console_plot.bar(platform_labels, list(platform_avg_scores_df['avg_score']), width=0.4) #bar heights and labels now both follow the sorted order
plt.show()

When averaging the game scores per console, the original Xbox, PlayStation 2, GameCube, and PSP had the best average scores. Those are older console generations from before 2010. Compare that to the next generation, the PlayStation 3, Xbox 360, and Wii, and you'll see their scores decreased relative to the consoles that came before them. A similar pattern holds for every new console generation, with average scores decreasing compared to the previous generation. In general, the older generation consoles outperformed their successors over this period of time.
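One way to sanity-check this generational claim is to bucket the platform slugs into rough generations and compare mean scores. The slug lists below are our own assumed grouping for illustration, not something from the source tables; adjust them to match the platforms present in your scraped data:

In [ ]:
#Assumed generational split of the metacritic platform slugs (illustrative only)
older_gen = ['gamecube', 'playstation-2', 'xbox', 'psp', 'ds']
newer_gen = ['playstation-4', 'xbox-one', 'switch', 'playstation-5', 'xbox-series-x']

for label, group in [("Older generations", older_gen), ("Newer generations", newer_gen)]:
    sub = combined_scores_and_ratings[combined_scores_and_ratings['platform'].isin(group)]
    print(f"{label}: mean score {sub['score'].mean():.2f} across {len(sub)} releases")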

In [95]:
#Break the data into 2 subsets: game scores prior to 2014, and scores from 2014 onward
combined_scores_and_ratings_before_2014 = combined_avg_scores_and_ratings[combined_avg_scores_and_ratings['year'] < 2014]
combined_scores_and_ratings_after_2014 = combined_avg_scores_and_ratings[combined_avg_scores_and_ratings['year'] >= 2014]

#histogram plotting so we can see the distributions
fig, around_2014 = plt.subplots(figsize=(10,5))
around_2014.set_xlabel("Scores")
around_2014.set_ylabel("Frequency")
around_2014.set_title("Score Distribution 2005 - 2021")
around_2014.hist(x=combined_scores_and_ratings_before_2014['score'], bins=30, color="r", density=True, alpha=0.5, label="2005-2013") #alpha so the overlapping histograms stay visible
around_2014.hist(x=combined_scores_and_ratings_after_2014['score'], bins=30, color="b", density=True, alpha=0.5, label="2014-2021")
around_2014.legend()

plt.show()

If we plot the pre-2014 and post-2014 data as histograms of score frequencies, we can see that the blue distribution (game scores from 2014 on) hits lower scores more frequently than the red distribution (games before 2014). Both distributions are centered roughly between scores of 6 and 7, with the pre-2014 scores concentrated closer to 7.
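We can quantify that visual read with simple summary statistics on the two subsets (a quick sketch using the dataframes defined in the cell above):

In [ ]:
#Mean and median of each period's average game scores
print("Before 2014: mean", round(combined_scores_and_ratings_before_2014['score'].mean(), 2),
      "median", combined_scores_and_ratings_before_2014['score'].median())
print("2014 onward: mean", round(combined_scores_and_ratings_after_2014['score'].mean(), 2),
      "median", combined_scores_and_ratings_after_2014['score'].median())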

Part IV: Hypothesis Testing

Okay, so the plots in the previous section show a decreasing trend in game scores, and thus in game quality. Now we have to determine whether our findings are statistically significant, because this apparent effect could in principle be due to random chance. Hypothesis testing lets us quantify how unlikely that is. We start by forming a general hypothesis that we assume is true to begin with, called the null hypothesis. We then state the alternative hypothesis that could be true if the null hypothesis were to be "rejected". Since our big question is Did the quality of games drop after 2014?, we start by assuming the opposite, which is our null hypothesis; the alternative hypothesis is then in line with our original question. Our hypotheses look like this:

Null Hypothesis: H0 = The quality of games didn't drop after 2014

Alternative Hypothesis: H1 = The quality of games dropped after 2014

We use a confidence level of 95% (a common default in data science, but you can use any confidence level you want). This makes our alpha value (significance level) 1 - 0.95 = 0.05, or 5%. We will use a library called statsmodels to fit a regression line to our data; it provides, among other statistics, a p-value. This p-value tells us whether we should reject or fail to reject the null hypothesis. Rejecting the null hypothesis says there might be some truth to the alternative hypothesis; failing to reject it says the data are consistent with the null hypothesis.

When do we reject the null hypothesis?

- We reject the null hypothesis when our p-value is less than our alpha value of 0.05. The smaller the p-value, the stronger the evidence to reject the null hypothesis.
- We fail to reject the null hypothesis when our p-value exceeds our alpha value of 0.05. The larger the p-value, the weaker the evidence to reject the null hypothesis.
In [93]:
#statsmodels is a very useful library when you want to do hypothesis testing.

import statsmodels.formula.api as smf

#ols stands for ordinary least squares regression, as in we're fitting a regression line to it and we get related stats like our p-value. 
statistic = smf.ols(formula= 'score ~ year', data=combined_avg_scores_and_ratings).fit()

statistic.summary()
Out[93]:
OLS Regression Results
Dep. Variable: score R-squared: 0.173
Model: OLS Adj. R-squared: 0.172
Method: Least Squares F-statistic: 189.8
Date: Tue, 10 May 2022 Prob (F-statistic): 2.37e-39
Time: 08:34:42 Log-Likelihood: -1579.7
No. Observations: 910 AIC: 3163.
Df Residuals: 908 BIC: 3173.
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 303.7527 21.579 14.076 0.000 261.402 346.103
year -0.1479 0.011 -13.777 0.000 -0.169 -0.127
Omnibus: 88.469 Durbin-Watson: 1.892
Prob(Omnibus): 0.000 Jarque-Bera (JB): 119.980
Skew: -0.757 Prob(JB): 8.84e-27
Kurtosis: 3.933 Cond. No. 9.52e+05


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 9.52e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

As we can see from the statistics table above, the slope is negative, which means the scores drop as the years progress (matching the slope of the combined scatter plot above). Our p-value is well below 0.05. Since the p-value is less than the 0.05 threshold, we reject the null hypothesis; the smaller the p-value, the stronger the evidence against the null. This supports the conclusion that game quality declined over the period, and that the drop after 2014 is statistically significant rather than a product of chance.
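The OLS fit tests for a linear trend across the whole period. A more direct check of the before/after-2014 split is a two-sample t-test on the two groups of average game scores; this sketch uses scipy (an extra dependency not imported elsewhere in this notebook) and leans on the review-independence assumption we stated at the start:

In [ ]:
from scipy.stats import ttest_ind

#Welch's two-sample t-test: does the mean score differ between the two periods?
t_stat, p_value = ttest_ind(combined_scores_and_ratings_before_2014['score'],
                            combined_scores_and_ratings_after_2014['score'],
                            equal_var=False) #don't assume equal variances
print(f"t = {t_stat:.2f}, p = {p_value:.2e}")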

Part V: Communication Of Insights

Here we communicate the results of our findings and some conclusions we reached, and generalize the answers to the questions that we asked.

From all the data above, where we used box plots, scatter plots, histograms, and regression lines to show a decreasing trend in game scores as the years progress, we can see that the quality of games is dropping, and that games prior to 2014 had higher scores, and thus higher quality, than games that came out after 2014. So we saw that quality is decreasing; why is that happening? Unfortunately, all we can do with this data is speculate about the why after reaching our conclusions. We saw an interesting trend when measuring console performance in terms of game scores: with every new generation of consoles, the scores decrease further. So the drop in game quality could be due to increased player expectations for the games on each new console, for example that visual fidelity should improve with every generation, so a game that doesn't deliver on that expectation gets marked down. If you watch game review videos frequently, you might hear a reviewer complain that a PlayStation 4 game looks like it has PlayStation 3 graphics, and so on. Another possible reason, harder to measure or get a grasp on, is that companies are afraid to innovate. This can be observed in the lists of games they've been releasing lately: how many new IPs came out after 2014 compared to before? It is especially prevalent in recent years, where companies mostly sell remasters and remakes, or make new games that are largely multiplayer and filled with microtransactions. Either they're afraid to innovate, or they simply don't want to, because remakes, remasters, and highly monetizable multiplayer games are easier bets. Again, these are speculations, not hard-proven answers. As for our big question of "Has the quality of video games dropped after 2014?", we can answer it with a yes, with statistical significance, as shown by the hypothesis testing.