# Load Numpy
import numpy as np
# Load MatPlotLib
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
# Load Pandas
import pandas as pd
# Load SQLITE
import sqlite3
# Load Stats
from scipy import stats
# Load Seaborn
import seaborn as sns
# SKLEARN stuff
import sklearn
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


#Creating a dataframe from csv files
#This is the data of the 1940-2020 nhl skaters
nhlSkaters = pd.read_csv('data/nhl/skater_data/skater_stats.csv', encoding='unicode_escape')

#This is the data of the 2008-2020 nhl goalies
nhlGoalies = pd.read_csv('data/nhl/goalie_data/nhl_goalie_stats.csv')

#Data cleanup
#Removing years before 2000 and after 2018
nhlSkaters.drop(nhlSkaters[nhlSkaters['Season'] < 2000].index, inplace=True)
nhlGoalies.drop(nhlGoalies[nhlGoalies['SEASON'] < 2000].index, inplace=True)
nhlSkaters.drop(nhlSkaters[nhlSkaters['Season'] > 2018].index, inplace=True)
nhlGoalies.drop(nhlGoalies[nhlGoalies['SEASON'] > 2018].index, inplace=True)

#Adding a Pos column to the nhlGoalies dataframe
nhlGoalies['Pos'] = 'G'

#Matching data headers for merging later on
nhlSkaters.rename(columns={'Season':'Year', 'Player':'Name', 'Tm': 'Team'}, inplace=True)
nhlGoalies.rename(columns={'PLAYER_NAME': 'Name', 'SEASON': 'Year', 'TM': 'Team', 'AGE': 'Age'}, inplace=True)

#Droping columns that are not needed
nhlSkaters.drop(columns = ['Unnamed: 0'], inplace = True)
nhlGoalies.drop(columns = ['PLAYER_ID', 'LG'], inplace = True)

/opt/homebrew/lib/python3.10/site-packages/IPython/core/interactiveshell.py:3398: DtypeWarning: Columns (22,23) have mixed types.Specify dtype option on import or set low_memory=False.
  exec(code_obj, self.user_global_ns, self.user_ns)


#Output the skater dataframe
nhlSkaters


#Output the goalie dataframe
nhlGoalies


#Creating a dataframe with just the goalie names
goalieNameSeason = nhlGoalies[['Name']]
goalieNameSeason

#Creating a dataframe with just the skater names
skaterNameSeason = nhlSkaters[['Name']]
skaterNameSeason

#Merging the skater and goalie names into one dataframe
nhlName = pd.concat([skaterNameSeason, goalieNameSeason], ignore_index=True)

#Making sure there are no duplicates
nhlName = nhlName.drop_duplicates()

#Output the dataframe
nhlName


#Output the dataframes created to csv
nhlSkaters.to_csv('data/nhl/skater_data/nhlSkaters.csv')
nhlGoalies.to_csv('data/nhl/goalie_data/nhlGoalies.csv')
nhlName.to_csv('webscrapers/nhlName.csv', index=False, header=False)


#Running the webscraper to get the salary data
#Website: https://www.hockeyzoneplus.com
#This takes about a day to run, so have it commented unless needed

#%env
#Will need to install the needed packages (just uncomment them)
#%pip install requests
#%pip install bs4
#%pip install lxml

#Will need to change to the correct directory of your computer
#%run /Users/ryanstevens/Documents/GitHub/CMPS-3160-Final-Project/webscrapers/nhlSalary.py


#Read in the salary csv file
nhlSalary = pd.read_csv('data/nhl/salary_data/salariesFinal.csv')

#Remove years before 2000 and after 2018
nhlSalary.drop(nhlSalary[nhlSalary['Year'] < 2000].index, inplace=True)
nhlSalary.drop(nhlSalary[nhlSalary['Year'] > 2018].index, inplace=True)

#Meging the salary dataframe with the skater and goalie dataframe
nhlSkaterSal =  nhlSkaters.merge(nhlSalary, on=['Name', 'Year'], how='left')
nhlGoalieSal =  nhlGoalies.merge(nhlSalary, on=['Name', 'Year'], how='left')

#Output the dataframes to csv
nhlSkaterSal.to_csv('data/nhl/skater_data/nhlSkaterSal.csv')
nhlGoalieSal.to_csv('data/nhl/goalie_data/nhlGoalieSal.csv')

#Print the dataframe
nhlGoalieSal
nhlSkaterSal

#Merge the skaterSal and goalieSal dataframes
nhl = pd.concat([nhlSkaterSal, nhlGoalieSal], ignore_index=True)

#Example of finding skaters and golies from the newly merged dataframe
skaters = nhl[nhl['Pos']!='G']
goalies = nhl[nhl['Pos']=='G']

#Remove rows with empty salary values
nhl.dropna(subset=['Salary'], inplace=True)

#Sort by year then name
nhl.sort_values(by=['Year', 'Name'], inplace=True)

#Output the dataframe to csv
nhl.to_csv('data/nhl/nhl.csv')


#Output nfl
nhl


#Read in stats data of each type
nfl_defensive = pd.read_csv("data/nfl/stats/Career_Stats_Defensive.csv")
nfl_field_goal = pd.read_csv("data/nfl/stats/Career_Stats_Field_Goal_Kickers.csv")
nfl_fumble = pd.read_csv("data/nfl/stats/Career_Stats_Fumbles.csv")
nfl_kick_return = pd.read_csv("data/nfl/stats/Career_Stats_Kick_Return.csv")
nfl_kickoff = pd.read_csv("data/nfl/stats/Career_Stats_Kickoff.csv")
nfl_passing = pd.read_csv("data/nfl/stats/Career_Stats_Passing.csv")
nfl_punt_return = pd.read_csv("data/nfl/stats/Career_Stats_Punt_Return.csv")
nfl_punting = pd.read_csv("data/nfl/stats/Career_Stats_Punting.csv")
nfl_receiving = pd.read_csv("data/nfl/stats/Career_Stats_Receiving.csv")
nfl_rushing = pd.read_csv("data/nfl/stats/Career_Stats_Rushing.csv")


#Method to clean up the all of the stats dataframes
def cleanData(data):
    #Remove unnecessary columns
    data.drop(columns=['Player Id'], inplace=True)
    #Fix name values
    temp = data['Name'].str.split(',')
    data['Name'] = temp.str[1] + ' ' + temp.str[0]
    #Drop years before 2000
    data.drop(data[data['Year'] <= 2000].index, inplace=True)
    #Sort by year
    data.sort_values('Year', inplace=True)
    #Drop rows with na for position
    data.dropna(subset=['Position'], inplace=True)

#Array with all dataframe names that need to be cleaned
dataToClean = [nfl_receiving, nfl_passing, nfl_rushing, nfl_defensive, nfl_fumble, nfl_field_goal, nfl_kick_return, nfl_kickoff, nfl_punt_return, nfl_punting]

#Loop for cleaning data
for i in dataToClean:
    cleanData(i)


#Removing unwanted stats from each dataframe individually
#nfl_receiving
nfl_receiving.drop(columns=['Yards Per Reception', 'Longest Reception', 'Receptions Longer than 20 Yards', 'Receptions Longer than 40 Yards', 'First Down Receptions'], inplace=True)
nfl_receiving.rename(columns={'Fumbles': 'Receiving Fumbles'}, inplace=True)
#nfl_passing
nfl_passing.drop(columns=['Passes Attempted', 'Passes Completed', 'Passing Yards Per Attempt', 'Percentage of TDs per Attempts', 'Int Rate', 'Longest Pass', 'Passes Longer than 20 Yards', 'Passes Longer than 40 Yards', 'Sacked Yards Lost'], inplace=True)
nfl_passing.rename(columns={'Ints': 'Passing Ints', 'Sacks': 'Sacked'}, inplace=True)
#nfl_rushing
nfl_rushing.drop(columns=['Yards Per Carry', 'Longest Rushing Run', 'Rushing First Downs', 'Percentage of Rushing First Downs', 'Rushing More Than 20 Yards', 'Rushing More Than 40 Yards'], inplace=True)
nfl_rushing.rename(columns={'Fumbles': 'Rushing Fumbles'}, inplace=True)
#nfl_defensive
nfl_defensive.drop(columns=['Solo Tackles', 'Assisted Tackles', 'Passes Defended', 'Ints for TDs', 'Int Yards', 'Yards Per Int', 'Longest Int Return'], inplace=True)
#nfl_fumble
nfl_fumble.drop(columns=['Own Fumble Return Yards', 'Opponent Fumble Return Yards', 'Out of Bounds Fumbles'], inplace=True)
nfl_fumble.rename(columns={'Fumbles': 'FFumbles', 'Touchbacks': 'TouchbacksFF'}, inplace=True)
#nfl_field_goal
nfl_field_goal.drop(columns=['FGs Made', 'FGs Attempted', 'FGs Made 20-29 Yards', 'FGs Attempted 20-29 Yards', 'FGs Made 30-39 Yards', 'FGs Attempted 30-39 Yards', 'FGs Made 40-49 Yards', 'FGs Attempted 40-49 Yards', 'FGs Made 50+ Yards', 'FGs Attempted 50+ Yards', 'Extra Points Attempted', 'Extra Points Made'], inplace=True)
#nfl_kick_return
nfl_kick_return.drop(columns=['Longest Return', 'Returns Longer than 20 Yards', 'Returns Longer than 40 Yards', 'Fair Catches', 'Yards Per Return'], inplace=True)
nfl_kick_return.rename(columns={'Fumbles': 'Kick Return Fumbles', 'Returns for TDs': 'Kick Return TD', 'Returns': 'Kick Returns', 'Yards Returned': 'Kick Yards Returned'}, inplace=True)
#nfl_kickoff
nfl_kickoff.drop(columns=['Out of Bounds Kickoffs'], inplace=True)
nfl_kickoff.rename(columns={'Touchbacks': 'Kickoff Touchbacks'}, inplace = True)
#nfl_punt_return
nfl_punt_return.drop(columns=['Returns Longer than 20 Yards', 'Returns Longer than 40 Yards', 'Fair Catches',  'Yards Per Return'], inplace=True)
nfl_punt_return.rename(columns={'Fumbles': 'Punt Return Fumbles', 'Returns for TDs': 'Punt Return TD', 'Returns': 'Punt Returns', 'Yards Returned': 'Punt Yards Returned'}, inplace=True)
#nfl_punting
nfl_punting.drop(columns=['Gross Punting Yards', 'Net Punting Yards', 'Longest Punt', 'Out of Bounds Punts', 'Downed Punts', 'Punts Inside 20 Yard Line', 'Fair Catches'], inplace=True)
nfl_punting.rename(columns={'Touchbacks': 'Punting Touchbacks'}, inplace=True)


# Returns the number of dashes contained in a series. Returns 0 if none are found. 
def hasDashes(col):
    res = col.str.contains("--").value_counts()
    if len(res) > 1:
        return res[1]
    else:
        return 0

# Derives and returns the mean of a numeric column, even if it contains "--" values.
def deriveMean(col):
    #a = col.sort_values(ascending = True)
    b = col.loc[col != "--"]
    c = b.astype(float)
    d = c.mean()
    return d

# Replaces all instances of "--" with the column average, as to preserve the column average while casting the series to type float. 
def replaceDashes(col, avg):
    # count = 0
    # for i in col:
    #     if i == "--":
    #         col.iloc[count] = avg
    #         count += 1

    #col.loc[col == "--"] == avg
    #df['Fee'].mask(df['Fee'] >=22000 ,'0', inplace=True)
    col.mask(col == "--", avg, inplace=True)
    
# Checks each column of a df to determine which contain dashes. Returns a list of column indexes. 
# Does not work because purely string columns cause an error while iterating. Don't know how to fix and have resorted to brute force. 
def whichColumns(df):
    cols = df.columns
    indexes = range(len(cols))
    res = []
    count = 0
    for i in indexes:
        isDash = hasDashes(df.iloc[:, count])
        if isDash > 0:
            res.append(i)
        count += 1
    return res

# Expedites data cleaning process
def autoClean(col):
    m = deriveMean(col)
    replaceDashes(col, m)


# Cleaning nfl_defensive further, 6, 7, 8
c6 = nfl_defensive.iloc[:, 6]
c7 = nfl_defensive.iloc[:, 7]
c8 = nfl_defensive.iloc[:, 8]

autoClean(c6)
autoClean(c7)
autoClean(c8)

# Cleaning nfl_receiving further, 5, 6, 7, 8, 9  
c5 = nfl_receiving.iloc[:, 5]
c6 = nfl_receiving.iloc[:, 6]
c7 = nfl_receiving.iloc[:, 7]
c8 = nfl_receiving.iloc[:, 8]
c9 = nfl_receiving.iloc[:, 9]

autoClean(c5)

count = 0
for i in c6:
    cur = c6.iloc[count]
    if "," in c6.iloc[count]:
        curList = cur.split(",")
        result = curList[0]
        result += curList[1][0]
        result += curList[1][1]
        result += curList[1][2]
        #print(result)
        nfl_receiving.iloc[:, 6].iloc[count] = result
    count += 1

autoClean(c6)
autoClean(c7)
autoClean(c8)
autoClean(c9)

# Cleaning nfl_passing further, 5, 7, 8, 9, 10, 11
c5 = nfl_passing.iloc[:, 5]
c7 = nfl_passing.iloc[:, 7]
c8 = nfl_passing.iloc[:, 8]
c9 = nfl_passing.iloc[:, 9]
c10 = nfl_passing.iloc[:, 10]
c11 = nfl_passing.iloc[:, 11]

hasDashes(c5)
autoClean(c5)

count = 0
for i in c7:
    cur = c7.iloc[count]
    if "," in c7.iloc[count]:
        curList = cur.split(",")
        result = curList[0]
        result += curList[1][0]
        result += curList[1][1]
        result += curList[1][2]
        #print(result)
        nfl_passing.iloc[:, 7].iloc[count] = result
    count += 1

autoClean(c7)
autoClean(c8)
autoClean(c9)
autoClean(c10)
autoClean(c11)

# Cleaning nfl_rushing further, 5, 6, 7, 8, 9, 10, 11
c5 = nfl_rushing.iloc[:, 5]
c6 = nfl_rushing.iloc[:, 6]
c7 = nfl_rushing.iloc[:, 7]
c9 = nfl_rushing.iloc[:, 9]
c10 = nfl_rushing.iloc[:, 10]

autoClean(c5)
autoClean(c6)

count = 0
for i in c7:
    cur = c7.iloc[count]
    if "," in c7.iloc[count]:
        curList = cur.split(",")
        result = curList[0]
        result += curList[1][0]
        result += curList[1][1]
        result += curList[1][2]
        #print(result)
        nfl_rushing.iloc[:, 7].iloc[count] = result
    count += 1

autoClean(c7)
autoClean(c8)
autoClean(c9)
autoClean(c10)
autoClean(c11)

# Cleaning nfl_fumble further, 5, 6, 7, 8, 9, 10, 11, 12
c5 = nfl_fumble.iloc[:, 5]
c6 = nfl_fumble.iloc[:, 6]
c7 = nfl_fumble.iloc[:, 7]
c8 = nfl_fumble.iloc[:, 8]
c9 = nfl_fumble.iloc[:, 9]
c10 = nfl_fumble.iloc[:, 10]
c11 = nfl_fumble.iloc[:, 11]
c12 = nfl_fumble.iloc[:, 12]

autoClean(c5)
autoClean(c6)
autoClean(c7)
autoClean(c8)
autoClean(c9)
autoClean(c10)
autoClean(c11)
autoClean(c12)

# Cleaning nfl_field_goal further, 5, 6, 7, 8, 9, 10, 11, 12, 13
c5 = nfl_field_goal.iloc[:, 5]
c6 = nfl_field_goal.iloc[:, 6]
c7 = nfl_field_goal.iloc[:, 7]
c8 = nfl_field_goal.iloc[:, 8]
c9 = nfl_field_goal.iloc[:, 9]
c10 = nfl_field_goal.iloc[:, 10]
c11 = nfl_field_goal.iloc[:, 11]
c12 = nfl_field_goal.iloc[:, 12]
c13 = nfl_field_goal.iloc[:, 13]

autoClean(c5)
autoClean(c6)
autoClean(c7)
autoClean(c8)
autoClean(c9)
autoClean(c10)
autoClean(c11)
autoClean(c12)
autoClean(c13)

# Cleaning nfl_kick_return further, 5, 6, 7, 8, 9
c5 = nfl_kick_return.iloc[:, 5]
c6 = nfl_kick_return.iloc[:, 6]
c7 = nfl_kick_return.iloc[:, 7]
c8 = nfl_kick_return.iloc[:, 8]
#c9 = nfl_kick_return.iloc[:, 9]

autoClean(c5)

count = 0
for i in c6:
    cur = c6.iloc[count]
    if "," in c6.iloc[count]:
        curList = cur.split(",")
        result = curList[0]
        result += curList[1][0]
        result += curList[1][1]
        result += curList[1][2]
        #print(result)
        nfl_kick_return.iloc[:, 6].iloc[count] = result
    count += 1

autoClean(c6)
autoClean(c7)
autoClean(c8)
#autoClean(c9)

# Cleaning nfl_kickoff further, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
c5 = nfl_kickoff.iloc[:, 5]
c6 = nfl_kickoff.iloc[:, 6]
c7 = nfl_kickoff.iloc[:, 7]
c8 = nfl_kickoff.iloc[:, 8]
c9 = nfl_kickoff.iloc[:, 9]
c10 = nfl_kickoff.iloc[:, 10]
c11 = nfl_kickoff.iloc[:, 11]
c12 = nfl_kickoff.iloc[:, 12]
c13 = nfl_kickoff.iloc[:, 13]
c14 = nfl_kickoff.iloc[:, 14]

autoClean(c5)

count = 0
for i in c6:
    cur = c6.iloc[count]
    if "," in c6.iloc[count]:
        curList = cur.split(",")
        result = curList[0]
        result += curList[1][0]
        result += curList[1][1]
        result += curList[1][2]
        #print(result)
        nfl_kickoff.iloc[:, 6].iloc[count] = result
    count += 1

autoClean(c6)
autoClean(c7)
autoClean(c8)
autoClean(c9)
autoClean(c10)
autoClean(c11)
autoClean(c12)
autoClean(c13)
autoClean(c14)

# Cleaning nfl_punt_return further, 5, 6, 7, 8, 9, 10
c5 = nfl_punt_return.iloc[:, 5]
c6 = nfl_punt_return.iloc[:, 6]
c7 = nfl_punt_return.iloc[:, 7]
c8 = nfl_punt_return.iloc[:, 8]
c9 = nfl_punt_return.iloc[:, 9]
#c10 = nfl_punt_return.iloc[:, 10]

autoClean(c5)
autoClean(c6)

count = 0
for i in c7:
    cur = c7.iloc[count]
    if "T" in c7.iloc[count]:
        curList = cur.split("T")
        result = curList[0]
        nfl_punt_return.iloc[:, 7].iloc[count] = result
    count += 1

autoClean(c7)

count = 0
for i in c8:
    cur = c8.iloc[count]
    if "T" in c8.iloc[count]:
        curList = cur.split("T")
        result = curList[0]
        nfl_punt_return.iloc[:, 8].iloc[count] = result
    count += 1

autoClean(c8)
autoClean(c9)
#autoClean(c10)

# Cleaning nfl_punting further, 5, 6, 7, 8, 9, 10, 11, 12
c5 = nfl_punting.iloc[:, 5]
c6 = nfl_punting.iloc[:, 6]
c7 = nfl_punting.iloc[:, 7]
c8 = nfl_punting.iloc[:, 8]
c9 = nfl_punting.iloc[:, 9]
c10 = nfl_punting.iloc[:, 10]
c11 = nfl_punting.iloc[:, 11]
c12 = nfl_punting.iloc[:, 12]

autoClean(c5)
autoClean(c6)
autoClean(c7)
autoClean(c8)
autoClean(c9)
autoClean(c10)
autoClean(c11)
autoClean(c12)

/opt/homebrew/lib/python3.10/site-packages/pandas/core/indexing.py:1732: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


#Merge each data set using outer merge on Name, Year, Position, Team, and Games Played
nflStats = pd.merge(nfl_passing, nfl_receiving, on=['Name', 'Year', 'Position', 'Team', 'Games Played'], how='outer')
nflStats.rename(columns={'Fumbles': 'Receiving Fumbles'}, inplace=True)
nflStats = pd.merge(nflStats, nfl_rushing, on=['Name', 'Year', 'Position', 'Team', 'Games Played'], how='outer')
nflStats.rename(columns={'Fumbles': 'Rushing Fumbles'}, inplace=True)
nflStats = pd.merge(nflStats, nfl_defensive, on=['Name', 'Year', 'Position', 'Team', 'Games Played'], how='outer')
nflStats = pd.merge(nflStats, nfl_fumble, on=['Name', 'Year', 'Position', 'Team', 'Games Played'], how='outer')
nflStats = pd.merge(nflStats, nfl_field_goal, on=['Name', 'Year', 'Position', 'Team', 'Games Played'], how='outer')
nflStats = pd.merge(nflStats, nfl_kick_return, on=['Name', 'Year', 'Position', 'Team', 'Games Played'], how='outer')
nflStats = pd.merge(nflStats, nfl_kickoff, on=['Name', 'Year', 'Position', 'Team', 'Games Played'], how='outer')
nflStats = pd.merge(nflStats, nfl_punt_return, on=['Name', 'Year', 'Position', 'Team', 'Games Played'], how='outer')
nflStats = pd.merge(nflStats, nfl_punting, on=['Name', 'Year', 'Position', 'Team', 'Games Played'], how='outer')

#Drop duplicate names and years
nflStats.drop_duplicates(subset=['Name', 'Year'], inplace=True)

#Drop na position values
nflStats.dropna(subset=['Position'], inplace=True)

#Sort by year
nflStats.sort_values('Year', inplace=True)

#Fix name values
nflStats.Name = nflStats.Name.str[1:]

#Output stats to csv
nflStats.to_csv("data/nfl/stats/nflStats.csv")


# Create subset dfs containing approproaite data, replace all NaNs with 0
nflFumbles = nflStats[["Receiving Fumbles", "Rushing Fumbles", "FFumbles", "Kick Return Fumbles", "Punt Return Fumbles"]]
nflFumbles.fillna(0, inplace = True)
nflTouchbacks = nflStats[["TouchbacksFF", "Punting Touchbacks", "Kickoff Touchbacks"]]
nflTouchbacks.fillna(0, inplace = True)
nflReturnsForTD = nflStats[[ "Kick Return TD", "Punt Return TD"]]
nflReturnsForTD.fillna(0, inplace = True)
nflReturns = nflStats[["Kick Returns", "Punt Returns"]]
nflReturns.fillna(0, inplace = True)
nflYardsReturned = nflStats[["Kick Yards Returned", "Punt Yards Returned"]]
nflYardsReturned.fillna(0, inplace = True)
nflTD = nflStats[["Receiving TDs", "Rushing TDs", "Fumble Return TDs", "Kick Return TD", "Punt Return TD"]]
nflTD.fillna(0, inplace = True)

# Prepare columns for addition 
nflFumbles = nflFumbles.apply(pd.to_numeric, errors='coerce')
nflTouchbacks = nflTouchbacks.apply(pd.to_numeric, errors='coerce')
nflReturnsForTD = nflReturnsForTD.apply(pd.to_numeric, errors='coerce')
nflReturns = nflReturns.apply(pd.to_numeric, errors='coerce')
nflYardsReturned = nflYardsReturned.apply(pd.to_numeric, errors='coerce')
nflTD = nflTD.apply(pd.to_numeric, errors='coerce')

# Create new total columns within each subset df
nflFumbles["Total Fumbles"] = nflFumbles["Receiving Fumbles"] + nflFumbles["Rushing Fumbles"] + nflFumbles["FFumbles"] + nflFumbles["Kick Return Fumbles"] + nflFumbles["Punt Return Fumbles"] 
nflTouchbacks["Total Touchbacks"] = nflTouchbacks["TouchbacksFF"] + nflTouchbacks["Punting Touchbacks"] + nflTouchbacks["Kickoff Touchbacks"]
nflReturnsForTD["Total TD Returns"] = nflReturnsForTD["Kick Return TD"] + nflReturnsForTD["Punt Return TD"] 
nflReturns["Total Returns"] = nflReturns["Kick Returns"] + nflReturns["Punt Returns"]
nflYardsReturned["Total Yards Returned"] = nflYardsReturned["Kick Yards Returned"] + nflYardsReturned["Punt Yards Returned"]
nflTD["Total TDs"] = nflTD["Receiving TDs"] + nflTD["Rushing TDs"] + nflTD["Fumble Return TDs"] + nflTD["Punt Return TD"]

# Drop original columns to faciliate future merge
nflFumbles.drop(columns=["Receiving Fumbles", "Rushing Fumbles", "FFumbles", "Kick Return Fumbles", "Punt Return Fumbles"], axis = 0, inplace=True)
nflTouchbacks.drop(columns=["TouchbacksFF", "Punting Touchbacks", "Kickoff Touchbacks"], axis = 0, inplace=True)
nflReturnsForTD.drop(columns=["Kick Return TD", "Punt Return TD"], axis=1, inplace=True)
nflReturns.drop(columns=["Kick Returns", "Punt Returns"], axis=1, inplace=True)
nflYardsReturned.drop(columns=["Kick Yards Returned", "Punt Yards Returned"], axis=1, inplace=True)
nflTD.drop(columns=["Receiving TDs", "Rushing TDs", "Fumble Return TDs", "Kick Return TD", "Punt Return TD"], axis=1, inplace=True)

# Round trailing decimal values
nflFumbles = nflFumbles.round(0)
nflTouchbacks = nflTouchbacks.round(0)
nflReturnsForTD = nflReturnsForTD.round(0)
nflReturns = nflReturns.round(0)
nflYardsReturned = nflYardsReturned.round(0)
nflTD = nflTD.round(0)

/opt/homebrew/lib/python3.10/site-packages/pandas/core/frame.py:5176: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


nflStats = pd.concat([nflStats, nflFumbles], axis=1)
nflStats = pd.concat([nflStats, nflTouchbacks], axis=1)
nflStats = pd.concat([nflStats, nflReturnsForTD], axis=1)
nflStats = pd.concat([nflStats, nflReturns], axis=1)
nflStats = pd.concat([nflStats, nflYardsReturned], axis=1)
nflStats = pd.concat([nflStats, nflTD], axis=1)


#Output stats
nflStats


#Running the webscraper to get the salary data
#Website: https://overthecap.com
#This takes about a day to run, so have it commented unless needed

#%env
#Will need to install the needed packages (just uncomment them)
#%pip install requests
#%pip install bs4
#%pip install lxml

#Will need to change to the correct directory of your computer
#%run /Users/ryanstevens/Documents/GitHub/CMPS-3160-Final-Project/webscrapers/nflSalary.py


#Read in salary csv
nflSalary = pd.read_csv('data/nfl/player_salaries/salaries.csv')

#Drop years before 2000 and after 2019
nflSalary.drop(nflSalary[nflSalary['Year'] > 2019].index, inplace=True)
nflSalary.drop(nflSalary[nflSalary['Year'] < 2000].index, inplace=True)

#Drop 0 salary values
nflSalary.drop(nflSalary[nflSalary['Salary'] == 0].index, inplace=True)

#Sort by year
nflSalary.sort_values('Year', inplace=True)

#Output salary to csv
nflSalary.to_csv("data/nfl/player_salaries/nflSalary.csv")

#Remove players with no salary data
drop = nflSalary.loc[nflSalary.Salary == '0']
drop.index
#nflSalary.drop(axis=0, index = p.index, inplace = True)

#Output salary
nflSalary


#Merge the salary and stats dataframes
nfl = pd.merge(nflStats, nflSalary)

#Output nfl to csv
nfl.to_csv("data/nfl/nfl.csv")


#Output nfl
nfl


#Creating dataframes from the csv files

#This is the data of 2000-2020 nba player salaries
nba_salaries = pd.read_csv('data/nba/player_salaries/nba-salaries.csv')

#This is the data of 2000-2020 nba player stats
nba_season = pd.read_csv('data/nba/stats/nba_season.csv')

#Matching dataframe column names
nba_season.rename(columns = {'Player':'Name', 'Year':'Season', 'Tm':'Team'}, inplace = True)
nba_salaries.rename(columns = {'season':'Season', 'name': 'Name', 'team': 'Team', 'position': 'Pos', 'salary': 'Salary'}, inplace = True)

#Removing broken values
nba_season.drop(columns = ['Unnamed: 0'], inplace = True)

#Clean out years greater than 2000 and less than 2017
def cleanYearGreater(data, year):
    return data[data['Season'] >= year]

def cleanyearLess(data, year):
    return data[data['Season'] <= year]

nba_season = cleanyearLess(nba_season, 2017)
nba_season = cleanYearGreater(nba_season, 2000)
nba_salaries = cleanyearLess(nba_salaries, 2017)
nba_salaries = cleanYearGreater(nba_salaries, 2000)

#Fixing player team data from abbreviations to full names
nba_abbrevs = {'BOS': 'Boston Celtics', 'BRK': 'Brooklyn Nets', 'NYK': 'New York Knicks', 'PHI': 'Philadelphia 76ers', 'TOR': 'Toronto Raptors',
                 'CHI': 'Chicago Bulls', 'CLE': 'Cleveland Cavaliers', 'DET': 'Detroit Pistons', 'IND': 'Indiana Pacers', 'MIL': 'Milwaukee Bucks', 
                 'ATL': 'Atlanta Hawks', 'CHA': 'Charlotte Hornets', 'MIA': 'Miami Heat', 'ORL': 'Orlando Magic', 'WAS': 'Washington Wizards', 
                 'DEN': 'Denver Nuggets', 'MIN': 'Minnesota Timberwolves', 'OKC': 'Oklahoma City Thunder', 'POR': 'Portland Trail Blazers', 
                 'UTA': 'Utah Jazz', 'GSW': 'Golden State Warriors', 'LAC': 'Los Angeles Clippers', 'LAL': 'Los Angeles Lakers', 'PHO': 'Phoenix Suns', 
                 'SAC': 'Sacramento Kings', 'DAL': 'Dallas Mavericks', 'HOU': 'Houston Rockets', 'MEM': 'Memphis Grizzlies', 'NOP': 'New Orleans Pelicans', 
                 'SAS': 'San Antonio Spurs', 'VAN': 'Vancouver Grizzlies', 'TOT': 'Two other teams', 'CHO': 'Charlotte Hornets', 'NJN': 'New Jersey Nets', 
                 'SEA': 'Seattle SuperSonics', 'CHH': 'Charlotte Hornets'}

#Function to replace team abbreviations with full names
def cleanTeam(data, teams):
    for i in data['Team']:
        for j in teams:
            if i == j:
                data['Team'].replace(i, nba_abbrevs[j], inplace=True)

cleanTeam(nba_season, nba_abbrevs)

#Making a salary dataframe without rank, team, and position for merging
nba_salMerge = nba_salaries
nba_salMerge.drop(columns = ['rank', 'Team', 'Pos'], inplace = True)
nba_salMerge

#Now we can merge the two dataframes using the name and season column
nba = nba_season.merge(nba_salMerge, on=['Name', 'Season'], how='left')

#Fixing year values by subtracting 1 from each rows season value
nba['Season'] = nba['Season'] - 1

#Dropping rows with null values for salary
nba.dropna(subset = ['Salary'], inplace = True)

#Dropping unused columns
nba.drop(columns = ['blanl', 'blank2'], inplace = True)

#Converting season to int
nba['Season'] = nba['Season'].astype(int)

#Output to csv
nba.to_csv('data/nba/nba.csv')


#Output nba
nba


#Create temporary DF to assign back several original columns
nhlTemp = nhl
nhlTemp


#Convert all strings to floats, incidentally replaces 'Year', 'Name' and 'Pos'
nhl= nhl.apply(pd.to_numeric, errors='coerce')

#Replace NaNs with column average
nhl.fillna(nhl.apply(lambda x: x.mean()), inplace=True)

#Fill in unintentionally erased columns
nhl["Name"] = nhlTemp["Name"]
nhl["Team"] = nhlTemp["Team"]
nhl["Pos"] = nhlTemp["Pos"]

nhl


nflTemp = nfl
nflTemp


#Convert all strings to floats, incidentally replaces 'Year', 'Name' and 'Position'
nfl= nfl.apply(pd.to_numeric, errors='coerce')

#Replace NaNs with column average
nfl.fillna(nfl.apply(lambda x: x.mean()), inplace=True)

#Fill in unintentionally erased columns
nfl["Name"] = nflTemp["Name"]
nfl["Position"] = nflTemp["Position"]
nfl["Team"] = nflTemp["Team"]

nfl


nbaTemp = nba
nba


#Convert all strings to floats, incidentally replaces 'Year', 'Name' and 'Pos'
nba = nba.apply(pd.to_numeric, errors='coerce')

#Replace NaNs with column average
nba.fillna(nba.apply(lambda x: x.mean()), inplace=True)

#Fill in unintentionally erased columns
nba["Name"] = nbaTemp["Name"]
nba["Pos"] = nbaTemp["Pos"]
nba["Team"] = nbaTemp["Team"]

nba


nhl.to_csv('data/nhl/nhlMeanFinal.csv')
nfl.to_csv('data/nfl/nflMeanFinal.csv')
nba.to_csv('data/nba/nbaMeanFinal.csv')


nflGraph = pd.DataFrame().assign(Name = nfl["Name"], Year = nfl["Year"], Salary = nfl["Salary"])
nflGraph.sort_values(by = "Year", ascending = True, inplace = True)
nflGraph = nflGraph.astype({"Year": "int", "Salary": "int"})

yearlyAvg = []
yearlyAvg.append(nflGraph.loc[nflGraph["Year"] == 2001].Salary.mean())

years = [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016]

i = 2002
while i <= 2016:
    newAvg = pd.DataFrame()
    newAvg = nflGraph.loc[nflGraph["Year"] == i].Salary.mean()
    i += 1
    yearlyAvg.append(newAvg)

nflPlot = pd.DataFrame({"Year":years, "Salary":yearlyAvg})
nflPlot.Salary = nflPlot.Salary.round()

#Prep NBA Data
nbaGraph = nba.loc[nba["Season"] >= 2001]
nbaGraph = nbaGraph.loc[nbaGraph["Season"] <= 2016]
nbaGraph = pd.DataFrame().assign(Name = nbaGraph["Name"], Year = nbaGraph["Season"], Salary = nbaGraph["Salary"])
nbaGraph.sort_values(by = "Year", ascending = True, inplace = True)
nbaGraph = nbaGraph.astype({"Year": "int", "Salary": "int"})

yearlyAvg2 = []
yearlyAvg2.append(nbaGraph.loc[nbaGraph["Year"] == 2001].Salary.mean())

i = 2002
while i <= 2016:
    newAvg = pd.DataFrame()
    newAvg = nbaGraph.loc[nbaGraph["Year"] == i].Salary.mean()
    i += 1
    yearlyAvg2.append(newAvg)

nbaPlot = pd.DataFrame({"Year":years, "Salary":yearlyAvg2})
nbaPlot.Salary = nbaPlot.Salary.round()

#Prep NHL Data
nhlGraph = nhl.loc[nhl["Year"] >= 2001]
nhlGraph = nhlGraph.loc[nhlGraph["Year"] <= 2016]
nhlGraph = pd.DataFrame().assign(Name = nhlGraph["Name"], Year = nhlGraph["Year"], Salary = nhlGraph["Salary"])
nhlGraph.sort_values(by = "Year", ascending = True, inplace = True)
nhlGraph = nhlGraph.astype({"Year": "int", "Salary": "int"})

yearlyAvg3 = []
yearlyAvg3.append(nhlGraph.loc[nhlGraph["Year"] == 2001].Salary.mean())

i = 2002
while i <= 2016:
    newAvg = pd.DataFrame()
    newAvg = nhlGraph.loc[nhlGraph["Year"] == i].Salary.mean()
    i += 1
    yearlyAvg3.append(newAvg)

nhlPlot = pd.DataFrame({"Year":years, "Salary":yearlyAvg3})
nhlPlot.Salary = nhlPlot.Salary.round()

#Plot the Data
plt.figure(figsize=(20,10))
plt.plot(nflPlot["Year"], nflPlot["Salary"], color = "Red")
plt.plot(nbaPlot["Year"], nbaPlot["Salary"], color = "Blue")
plt.plot(nhlPlot["Year"], nhlPlot["Salary"], color = "Green")
plt.title("Average Salary by Year")
plt.xlabel("Year")
plt.ylabel("Salary")
plt.legend(["NFL", "NBA", "NHL"])
plt.show()


#NHL
nhlTemp = pd.DataFrame().assign(Name = nhl["Name"], Year = nhl["Year"], Position = nhl["Pos"], Salary = nhl["Salary"])
positions = ["C/LW", "LW/D", "LW/RW", "RW/C", "RW/LW"]
for i in range(len(positions)):
    nhlTemp.drop(nhlTemp.loc[nhlTemp["Position"] == positions[i]].index, inplace = True)
nhlTemp.sort_values(by = "Year", ascending = True, inplace = True)
nhlTemp = nhlTemp.astype({"Year": "int", "Salary": "int"})

nhl_cube = nhlTemp.pivot_table(index = "Year", columns=["Position"], values = "Salary", aggfunc=np.mean)
nhl_cube.plot.line(figsize=(20,10), title = "Average Salary by Position by Year", xlabel = "Year", ylabel = "Salary", rot = 0)

nhl_table = (nhlTemp.groupby(["Year", "Position"])).Salary.mean().round()
nhl_table.to_frame()


#NFL
nflTemp = pd.DataFrame().assign(Name = nfl["Name"], Year = nfl["Year"], Position = nfl["Position"], Salary = nfl["Salary"])
nflTemp.sort_values(by = "Year", ascending = True, inplace = True)
nflTemp = nflTemp.astype({"Year": "int", "Salary": "int"})

nfl_cube = nflTemp.pivot_table(index = "Year", columns=["Position"], values = "Salary", aggfunc=np.mean)
nfl_cube.plot.line(figsize=(20,10), title = "Average Salary by Position by Year", xlabel = "Year", ylabel = "Salary", rot = 0)

nfl_table = (nflTemp.groupby(["Year", "Position"])).Salary.mean().round()
nfl_table.to_frame()


#NBA
nbaTemp = nba.loc[nba["Season"] >= 2001]
nbaTemp = nbaTemp.loc[nbaTemp["Season"] <= 2016]
nbaTemp = pd.DataFrame().assign(Name = nba["Name"], Year = nba["Season"], Position = nba["Pos"], Salary = nba["Salary"])
nbaTemp.sort_values(by = "Year", ascending = True, inplace = True)
nbaTemp = nbaTemp.astype({"Year": "int", "Salary": "int"})

nba_cube = nbaTemp.pivot_table(index = "Year", columns=["Position"], values = "Salary", aggfunc=np.mean)
nba_cube.plot.line(figsize=(20,10), title = "Average Salary by Position by Year", xlabel = "Year", ylabel = "Salary", rot = 0)

nba_table = (nbaTemp.groupby(["Year", "Position"])).Salary.mean().round()
nba_table.to_frame()


#NHL
nhlTemp = pd.DataFrame().assign(Name = nhl["Name"], Year = nhl["Year"], Team = nhl["Team"], Salary = nhl["Salary"])
nhlTemp.sort_values(by = "Year", ascending = True, inplace = True)
nhlTemp = nhlTemp.astype({"Year": "int", "Salary": "int"})

nhl_cube = nhlTemp.pivot_table(index = "Year", columns=["Team"], values = "Salary", aggfunc=np.mean)
nhl_cube.plot.line(figsize=(20,10), title = "Average Salary by Team by Year", xlabel = "Year", ylabel = "Salary", rot = 0)

nhl_table = (nhlTemp.groupby(["Year", "Team"])).Salary.mean().round()
nhl_table.to_frame()


#NFL
nflTemp = pd.DataFrame().assign(Name = nfl["Name"], Year = nfl["Year"], Team = nfl["Team"], Salary = nfl["Salary"])
nflTemp.sort_values(by = "Year", ascending = True, inplace = True)
nflTemp = nflTemp.astype({"Year": "int", "Salary": "int"})

nfl_cube = nflTemp.pivot_table(index = "Year", columns=["Team"], values = "Salary", aggfunc=np.mean)
nfl_cube.plot.line(figsize=(20,10), title = "Average Salary by Team by Year", xlabel = "Year", ylabel = "Salary", rot = 0)

nfl_table = (nflTemp.groupby(["Year", "Team"])).Salary.mean().round()
nfl_table.to_frame()


#NBA
nbaTemp = nba.loc[nba["Season"] >= 2001]
nbaTemp = nbaTemp.loc[nbaTemp["Season"] <= 2016]
nbaTemp = pd.DataFrame().assign(Name = nba["Name"], Year = nba["Season"], Team = nba["Team"], Salary = nba["Salary"])
nbaTemp.sort_values(by = "Year", ascending = True, inplace = True)
nbaTemp = nbaTemp.astype({"Year": "int", "Salary": "int"})

nba_cube = nbaTemp.pivot_table(index = "Year", columns=["Team"], values = "Salary", aggfunc=np.mean)
nba_cube.plot.line(figsize=(20,10), title = "Average Salary by Team by Year", xlabel = "Year", ylabel = "Salary", rot = 0)

nba_table = (nbaTemp.groupby(["Year", "Team"])).Salary.mean().round()
nba_table.to_frame()


# Will need to make a new column called total tds and then add all the types of tds together
# Combine TD Passes, Rushing TDs, Receiving TDs, Ints for TDs, Fumble Return TDs, Returns for TDs_x, Returns for TDs_y, TDs Returned on Punt


nhlCmatrix = nhl.corr()
plt.figure(figsize=(60,30))
sns.heatmap(nhlCmatrix, annot=True)
plt.show()


nflCmatrix = nfl.corr()
plt.figure(figsize=(100,50))
sns.heatmap(nflCmatrix, annot=True)
plt.show()


nbaCmatrix = nba.corr()
plt.figure(figsize=(60,30))
sns.heatmap(nbaCmatrix, annot=True)
plt.show()


nhlScat = nhl.drop(columns=["ATOI"])
scatter_matrix(nhlScat, figsize=(60,30))
plt.show()


scatter_matrix(nfl, figsize=(100,50))
plt.show()


scatter_matrix(nba, figsize=(60,30))
plt.show()


# Pre-processing
nhlMod = nhl
nhlMod = nhl.sort_values(by = ["Name", "Year"], ascending=True)
nhlMod = nhl.drop(columns=["Salary", "Name", "Team", "Pos", "ATOI"], axis=1)


# GRAPHING PREDICTED VERSUS ACTUAL SALARIES

# Initialize variables
features = nhlMod.columns
X_dict = nhlMod[features].to_dict(orient="records")
X_train_dict = nhlMod[features].to_dict(orient="records")

y = nhl["Salary"]
y = y.astype({"Salary": "int"})
y_train = nhl["Salary"]
y_train = y_train.astype({"Salary": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)

res = []
for index, row in nhlMod.iterrows():
    X_new_dict = row.to_dict() 
    X_new = vec.transform(X_new_dict)
    X_new_sc = scaler.transform(X_new)

    # K-Nearest Neighbors Model
    model = KNeighborsRegressor(n_neighbors=10)
    model.fit(X_train_sc, y_train)
    newPred = model.predict(X_new_sc).round()
    res.append(newPred)

nhlCompare = pd.DataFrame(nhl.Salary, columns=["Salary"])
preds = pd.DataFrame(res, columns=["Predictions"])
nhlCompare = pd.concat([nhlCompare, preds], axis=1)
nhlCompare.plot.line(figsize=(30,20))

<AxesSubplot:>


#GRAPHING MSE 

# Initialize variables
features = nhlMod.columns
X_dict = nhlMod[features].to_dict(orient="records")
X_train_dict = nhlMod[features].to_dict(orient="records")
X_new_dict = nhlMod.iloc[0].to_dict()

y = nhl["Salary"]
y = y.astype({"Salary": "int"})
y_train = nhl["Salary"]
y_train = y_train.astype({"Salary": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

# K-Nearest Neighbors Model
model = KNeighborsRegressor(n_neighbors=10)
model.fit(X_train_sc, y_train)
display("The model predicts that the player at index zero will make: %s" % model.predict(X_new_sc)[0].round())

#Compare results
nhlVerify = nhl.iloc[0].Salary
display("The actual salary of the player at index zero is: %s" % nhlVerify)

#Calculates estimate of test error based on 10-fold cross validation
def get_cv_error(k):
    model = KNeighborsRegressor(n_neighbors=k)
    pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
    result = (-(cross_val_score(pipeline, X_dict, y, cv=10, scoring="neg_mean_absolute_error"))).mean()
    return result
    
# Calculate test error per number of K-neighbors used
ks = pd.Series(range(1, 20))
ks.index = range(1, 20)
test_errs = ks.apply(get_cv_error)
test_errs.plot.line()

'The model predicts that the player at index zero will make: 577350.0'

'The actual salary of the player at index zero is: 500000.0'

<AxesSubplot:>


# Pre-processing
nflMod = nfl
nflMod = nfl.sort_values(by = ["Name", "Year"], ascending=True)
nflMod = nfl.drop(columns=["Salary", "Name", "Team", "Position"], axis=1)


# GRAPHING PREDICTED VERSUS ACTUAL SALARIES

# Initialize variables
features = nflMod.columns
X_dict = nflMod[features].to_dict(orient="records")
X_train_dict = nflMod[features].to_dict(orient="records")

y = nfl["Salary"]
y = y.astype({"Salary": "int"})
y_train = nfl["Salary"]
y_train = y_train.astype({"Salary": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

res = []
for index, row in nflMod.iterrows():
    X_new_dict = row.to_dict() 
    X_new = vec.transform(X_new_dict)
    X_new_sc = scaler.transform(X_new)

    # K-Nearest Neighbors Model
    model = KNeighborsRegressor(n_neighbors=10)
    model.fit(X_train_sc, y_train)
    newPred = model.predict(X_new_sc).round()
    res.append(newPred)

nflCompare = pd.DataFrame(nfl.Salary, columns=["Salary"])
preds = pd.DataFrame(res, columns=["Predictions"])
nflCompare = pd.concat([nflCompare, preds], axis=1)
nhlCompare.plot.line(figsize=(30,20))

<AxesSubplot:>


#GRAPHING MSE

# Initialize variables
features = nflMod.columns
X_dict = nflMod[features].to_dict(orient="records")
X_train_dict = nflMod[features].to_dict(orient="records")
X_new_dict = nflMod.iloc[0].to_dict()

y = nfl["Salary"]
y = y.astype({"Salary": "int"})
y_train = nfl["Salary"]
y_train = y_train.astype({"Salary": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

# # K-Nearest Neighbors Model
model = KNeighborsRegressor(n_neighbors=30)
model.fit(X_train_sc, y_train)
display("The model predicts that the player at index zero will make: %s" % model.predict(X_new_sc)[0].round())

#Compare results
nflVerify = nfl.iloc[0].Salary
display("The actual salary of the player at index zero is: %s" % nflVerify)

#Calculates estimate of test error based on 10-fold cross validation
def get_cv_error(k):
    model = KNeighborsRegressor(n_neighbors=k)
    pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
    result = (-(cross_val_score(pipeline, X_dict, y, cv=10, scoring="neg_mean_absolute_error"))).mean()
    return result

# Calculate test error per number of K-neighbors used  
ks = pd.Series(range(1, 20))
ks.index = range(1, 20)
test_errs = ks.apply(get_cv_error)
test_errs.plot.line()

'The model predicts that the player at index zero will make: 1460189.0'

'The actual salary of the player at index zero is: 298000'

<AxesSubplot:>


# Pre-processing
nbaMod = nba
nbaMod = nba.sort_values(by = ["Name", "Season"], ascending=True)
nbaMod.drop(columns=["Salary", "Name", "Team", "Pos"], axis=1, inplace=True)


# GRAPHING PREDICTED VERSUS ACTUAL SALARIES

# Initialize variables
features = nbaMod.columns
X_dict = nbaMod[features].to_dict(orient="records")
X_train_dict = nbaMod[features].to_dict(orient="records")

y = nba["Salary"]
y = y.astype({"Salary": "int"})
y_train = nba["Salary"]
y_train = y_train.astype({"Salary": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

res = []
for index, row in nbaMod.iterrows():
    X_new_dict = row.to_dict() 
    X_new = vec.transform(X_new_dict)
    X_new_sc = scaler.transform(X_new)

    # K-Nearest Neighbors Model
    model = KNeighborsRegressor(n_neighbors=10)
    model.fit(X_train_sc, y_train)
    newPred = model.predict(X_new_sc).round()
    res.append(newPred)

nbaCompare = pd.DataFrame(nba.Salary, columns=["Salary"])
preds = pd.DataFrame(res, columns=["Predictions"])
nbaCompare = pd.concat([nbaCompare, preds], axis=1)
nbaCompare.plot.line(figsize=(30,20))

<AxesSubplot:>


#GRAPHING MSE

# Initialize variables
features = nbaMod.columns
X_dict = nbaMod[features].to_dict(orient="records")
X_train_dict = nbaMod[features].to_dict(orient="records")
X_new_dict = nbaMod.iloc[0].to_dict()

y = nba["Salary"]
y = y.astype({"Salary": "int"})
y_train = nba["Salary"]
y_train = y_train.astype({"Salary": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

# K-Nearest Neighbors Model
model = KNeighborsRegressor(n_neighbors=250)
model.fit(X_train_sc, y_train)
display("The model predicts that the player at index zero will make: %s" % model.predict(X_new_sc)[0].round())

#Compare results
nbaVerify = nba.iloc[0].Salary
display("The actual salary of the player at index zero: %s" % nbaVerify)

#Calculates estimate of test error based on 10-fold cross validation
def get_cv_error(k):
    model = KNeighborsRegressor(n_neighbors=k)
    pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
    result = (-(cross_val_score(pipeline, X_dict, y, cv=10, scoring="neg_mean_absolute_error"))).mean()
    return result
    
ks = pd.Series(range(1, 20))
ks.index = range(1, 20)
test_errs = ks.apply(get_cv_error)
test_errs.plot.line()

'The model predicts that the player at index zero will make: 4005482.0'

'The actual salary of the player at index zero: 9000000.0'

<AxesSubplot:>


#Pre-processing
nhl.reset_index(drop=True, inplace=True)
nhlZtemp = nhl.copy()
nhlZtemp.drop(columns=['Name', 'Team', 'Age', 'Pos', 'Year', 'Salary', "GP"], inplace=True)
nhlZtemp = stats.zscore(nhlZtemp)
nhlZtemp['PStat'] = nhlZtemp.sum(axis=1).round()
NHLpstats = nhlZtemp['PStat']
NHLpstats = pd.DataFrame(NHLpstats)
nhl = pd.concat([nhl, NHLpstats], axis=1)

nhlMod = nhl
nhlMod = nhl.sort_values(by = ["Name", "Year"], ascending=True)
nhlMod.drop(columns="ATOI", inplace=True)


# GRAPHING PREDICTED VERSUS ACTUAL SALARIES

#Initialize variables
features = ["GP", "PStat"]
X_dict = nhlMod[features].to_dict(orient="records")
X_train_dict = nhlMod[features].to_dict(orient="records")

y = nhl["Salary"]
y = y.astype({"Salary": "int"})
y_train = nhl["Salary"]
y_train = y_train.astype({"Salary": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)

res = []
for index, row in nhlMod.iterrows():
    X_new_dict = row.to_dict() 
    X_new = vec.transform(X_new_dict)
    X_new_sc = scaler.transform(X_new)

    # K-Nearest Neighbors Model
    model = KNeighborsRegressor(n_neighbors=10)
    model.fit(X_train_sc, y_train)
    newPred = model.predict(X_new_sc).round()
    res.append(newPred)

nhlCompare = pd.DataFrame(nhl.Salary, columns=["Salary"])
preds = pd.DataFrame(res, columns=["Predictions"])
nhlCompare = pd.concat([nhlCompare, preds], axis=1)
nhlCompare.plot.line(figsize=(30,20))

<AxesSubplot:>


# GRAPHING TEST ERROR

# Initialize variables
features = ["GP", "PStat"]
X_dict = nhlMod[features].to_dict(orient="records")
X_train_dict = nhlMod[features].to_dict(orient="records")
X_new_dict = nhlMod[features].iloc[0].to_dict()

y = nhl["Salary"]
y = y.astype({"Salary": "int"})
y_train = nhl["Salary"]
y_train = y_train.astype({"Salary": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

# K-Nearest Neighbors Model
model = KNeighborsRegressor(n_neighbors=30)
model.fit(X_train_sc, y_train)
display("The model predicts that for the PStat value of the player at index zero, the player will make: %s" % model.predict(X_new_sc).round()[0])

#Compare results
nhlVerify = nhl.iloc[0].Salary
display("The actual PStat value of the player at index zero: %s" % nhlVerify)

#Calculates estimate of test error based on 10-fold cross validation
def get_cv_error(k):
    model = KNeighborsRegressor(n_neighbors=k)
    pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
    result = (-(cross_val_score(pipeline, X_dict, y, cv=10, scoring="neg_mean_absolute_error"))).mean()
    return result
    
ks = pd.Series(range(1, 20))
ks.index = range(1, 20)
test_errs = ks.apply(get_cv_error)
test_errs.plot.line()

'The model predicts that for the PStat value of the player at index zero, the player will make: 1991839.0'

'The actual PStat value of the player at index zero: 500000.0'

<AxesSubplot:>


#Pre-processing
nfl.reset_index(drop=True, inplace=True)
nflZtemp = nfl.copy()
nflZtemp.drop(columns=['Name', 'Team', 'Position', 'Year', 'Salary'], inplace=True)
nflZtemp = stats.zscore(nflZtemp)
nflZtemp['PStat'] = nflZtemp.sum(axis=1)
NFLpstats = nflZtemp['PStat']
NFLpstats = pd.DataFrame(NFLpstats)
nfl = pd.concat([nfl, NFLpstats], axis=1)

nflMod = nfl
nflMod = nfl.sort_values(by = ["Name", "Year"], ascending=True)


#GRAPHING PREDICTED VERSUS ACTUAL SALARIES

# Initialize variable
features = ["Games Played", "PStat"]
X_dict = nflMod[features].to_dict(orient="records")
X_train_dict = nflMod[features].to_dict(orient="records")

y = nfl["Salary"]
y = y.astype({"Salary": "int"})
y_train = nfl["Salary"]
y_train = y_train.astype({"Salary": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)

res = []
for index, row in nflMod.iterrows():
    X_new_dict = row.to_dict() 
    X_new = vec.transform(X_new_dict)
    X_new_sc = scaler.transform(X_new)

    # K-Nearest Neighbors Model
    model = KNeighborsRegressor(n_neighbors=10)
    model.fit(X_train_sc, y_train)
    newPred = model.predict(X_new_sc).round()
    res.append(newPred)

nflCompare = pd.DataFrame(nfl.Salary, columns=["Salary"])
preds = pd.DataFrame(res, columns=["Predictions"])
nflCompare = pd.concat([nflCompare, preds], axis=1)
nflCompare.plot.line(figsize=(30,20))

<AxesSubplot:>


# GRAPHING MSE

# Initialize variables
features = ["Games Played", "PStat"]
X_dict = nflMod[features].to_dict(orient="records")
X_train_dict = nflMod[features].to_dict(orient="records")
X_new_dict = nflMod[features].iloc[0].to_dict()

y = nfl["Salary"]
y = y.astype({"Salary": "int"})
y_train = nfl["Salary"]
y_train = y_train.astype({"Salary": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

# # K-Nearest Neighbors Model
model = KNeighborsRegressor(n_neighbors=30)
model.fit(X_train_sc, y_train)
display("The model predicts that for the PStat value of the player at index zero, the player will make: %s" % model.predict(X_new_sc).round()[0])

#Compare results
nflVerify = nfl.iloc[0].PStat
display("The actual PStat value of the player at index zero: %s" % nhlVerify)

#Calculates estimate of test error based on 10-fold cross validation
def get_cv_error(k):
    model = KNeighborsRegressor(n_neighbors=k)
    pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
    result = (-(cross_val_score(pipeline, X_dict, y, cv=10, scoring="neg_mean_absolute_error"))).mean()
    return result
    
ks = pd.Series(range(1, 20))
ks.index = range(1, 20)
test_errs = ks.apply(get_cv_error)
test_errs.plot.line()

'The model predicts that for the PStat value of the player at index zero, the player will make: 3170443.0'

'The actual PStat value of the player at index zero: 500000.0'

<AxesSubplot:>


# Pre-processing
nba.reset_index(drop=True, inplace=True)
nbaZtemp = nba.copy()
nbaZtemp.drop(columns=['Name', 'Team', 'Pos', 'Season', 'Salary', 'Age'], inplace=True)
nbaZtemp = stats.zscore(nbaZtemp)
nbaZtemp['PStat'] = nbaZtemp.sum(axis=1)
NBApstats = nbaZtemp['PStat']
NBApstats = pd.DataFrame(NBApstats)
nba = pd.concat([nba, NBApstats], axis=1)

nbaMod = nba
nbaMod = nba.sort_values(by = ["Name", "Season"], ascending=True)


# GRAPHING PREDICTED VERSUS ACTUAL SALARIES

# Initialize variables
features = ["G", "PStat"]
X_dict = nbaMod[features].to_dict(orient="records")
X_train_dict = nbaMod[features].to_dict(orient="records")

y = nba["Salary"]
y = y.astype({"Salary": "int"})
y_train = nba["Salary"]
y_train = y_train.astype({"Salary": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)

res = []
for index, row in nbaMod.iterrows():
    X_new_dict = row.to_dict() 
    X_new = vec.transform(X_new_dict)
    X_new_sc = scaler.transform(X_new)

    # K-Nearest Neighbors Model
    model = KNeighborsRegressor(n_neighbors=10)
    model.fit(X_train_sc, y_train)
    newPred = model.predict(X_new_sc).round()
    res.append(newPred)

nbaCompare = pd.DataFrame(nba.Salary, columns=["Salary"])
preds = pd.DataFrame(res, columns=["Predictions"])
nbaCompare = pd.concat([nbaCompare, preds], axis=1)
nbaCompare.plot.line(figsize=(30,20))

<AxesSubplot:>


# GRAPHING MSE

# Initalize variables
features = ["PStat"]
X_dict = nbaMod[features].to_dict(orient="records")
X_train_dict = nbaMod[features].to_dict(orient="records")
X_new_dict = nbaMod[features].iloc[0].to_dict()

y = nba["Salary"]
y_train = nba["Salary"]
y_train = y_train.astype({"Salary": "int"})
y = y.astype({"Salary": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

# # K-Nearest Neighbors Model
model = KNeighborsRegressor(n_neighbors=30)
model.fit(X_train_sc, y_train)
display("The model predicts that for the PStat value of the player at index zero, the player will make: %s" % model.predict(X_new_sc).round()[0])

#Compare results
nbaVerify = nba.iloc[0].PStat
display("The actual PStat value of the player at index zero: %s" % nbaVerify)

#Calculates estimate of test error based on 10-fold cross validation
def get_cv_error(k):
    model = KNeighborsRegressor(n_neighbors=k)
    pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
    result = (-(cross_val_score(pipeline, X_dict, y, cv=10, scoring="neg_mean_absolute_error"))).mean()
    return result
    
ks = pd.Series(range(1, 20))
ks.index = range(1, 20)
test_errs = ks.apply(get_cv_error)
test_errs.plot.line()

'The model predicts that for the PStat value of the player at index zero, the player will make: 3027492.0'

'The actual PStat value of the player at index zero: 63.0273586723053'

<AxesSubplot:>


# GRAPHING PREDICTED VERSUS ACTUAL SALARIES
features = ["GP", "Salary"]
X_dict = nhlMod[features].to_dict(orient="records")
X_train_dict = nhlMod[features].to_dict(orient="records")

y = nhlMod["PStat"]
y = y.astype({"PStat": "int"})
y_train = nhl["PStat"]
y_train = y_train.astype({"PStat": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)

res = []
for index, row in nflMod.iterrows():
    X_new_dict = row.to_dict() 
    X_new = vec.transform(X_new_dict)
    X_new_sc = scaler.transform(X_new)

    # K-Nearest Neighbors Model
    model = KNeighborsRegressor(n_neighbors=10)
    model.fit(X_train_sc, y_train)
    newPred = model.predict(X_new_sc).round()
    res.append(newPred)

nhlCompare = pd.DataFrame(nhl.PStat, columns=["PStat"])
preds = pd.DataFrame(res, columns=["Predictions"])
nhlCompare = pd.concat([nhlCompare, preds], axis=1)
nhlCompare.plot.line(figsize=(30,20))

<AxesSubplot:>


# GRAPHING TEST ERROR

# Initialize variables
features = ["Salary"]
X_dict = nhlMod[features].to_dict(orient="records")
X_train_dict = nhlMod[features].to_dict(orient="records")
X_new_dict = nhlMod[features].iloc[0].to_dict()

y = nhl["PStat"]
y_train = nhl["PStat"]
y_train = y_train.astype({"PStat": "int"})
y = y.astype({"PStat": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

# K-Nearest Neighbors Model
model = KNeighborsRegressor(n_neighbors=30)
model.fit(X_train_sc, y_train)
display("The model predicts that for the player at index zero, the player will make: %s" % model.predict(X_new_sc).round()[0])

#Compare results
nhlVerify = nhl.iloc[0].PStat
display("The actual salary of the player at index zero: %s" % nhlVerify)

#Calculates estimate of test error based on 10-fold cross validation
def get_cv_error(k):
    model = KNeighborsRegressor(n_neighbors=k)
    pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
    result = (-(cross_val_score(pipeline, X_dict, y, cv=10, scoring="neg_mean_absolute_error"))).mean()
    return result
    
ks = pd.Series(range(1, 20))
ks.index = range(1, 20)
test_errs = ks.apply(get_cv_error)
test_errs.plot.line()

'The model predicts that for the player at index zero, the player will make: 1.0'

'The actual salary of the player at index zero: -4.0'

<AxesSubplot:>


# GRAPHING PREDICTED VERSUS ACTUAL SALARIES

# Initialize variables
features = ["Games Played", "Salary"]
X_dict = nflMod[features].to_dict(orient="records")
X_train_dict = nflMod[features].to_dict(orient="records")

y = nfl["PStat"]
y = y.astype({"PStat": "int"})
y_train = nfl["PStat"]
y_train = y_train.astype({"PStat": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)

res = []
for index, row in nflMod.iterrows():
    X_new_dict = row.to_dict() 
    X_new = vec.transform(X_new_dict)
    X_new_sc = scaler.transform(X_new)

    # K-Nearest Neighbors Model
    model = KNeighborsRegressor(n_neighbors=10)
    model.fit(X_train_sc, y_train)
    newPred = model.predict(X_new_sc).round()
    res.append(newPred)

nflCompare = pd.DataFrame(nfl.PStat, columns=["PStat"])
preds = pd.DataFrame(res, columns=["Predictions"])
nflCompare = pd.concat([nflCompare, preds], axis=1)
nflCompare.plot.line(figsize=(30,20))

<AxesSubplot:>


# GRAPHING MSE

# Initialize variables
nflModel = nfl.PStat
features = ["Salary"]

X_dict = nfl[features].to_dict(orient="records")
X_train_dict = nfl[features].to_dict(orient="records")
X_new_dict = nflMod[features].iloc[0].to_dict()

y = nfl["PStat"]
y = y.astype({"PStat": "int"})
y_train = nfl["PStat"]
y_train = y_train.astype({"PStat": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

# # K-Nearest Neighbors Model
model = KNeighborsRegressor(n_neighbors=30)
model.fit(X_train_sc, y_train)
display("The model predicts that for the player at index zero, the player will make: %s" % model.predict(X_new_sc).round()[0])

#Compare results
nflVerify = nfl.iloc[0].PStat
display("The actual salary of the player at index zero: %s" % nhlVerify)

#Calculates estimate of test error based on 10-fold cross validation
def get_cv_error(k):
    model = KNeighborsRegressor(n_neighbors=k)
    pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
    result = (-(cross_val_score(pipeline, X_dict, y, cv=10, scoring="neg_mean_absolute_error"))).mean()
    return result
    
ks = pd.Series(range(1, 20))
ks.index = range(1, 20)
test_errs = ks.apply(get_cv_error)
test_errs.plot.line()

'The model predicts that for the player at index zero, the player will make: 2.0'

'The actual salary of the player at index zero: -4.0'

<AxesSubplot:>


# GRAPHING PREDICTED VERSUS ACTUAL SALARIES

# Initialize variables
features = ["G", "Salary"]
X_dict = nbaMod[features].to_dict(orient="records")
X_train_dict = nbaMod[features].to_dict(orient="records")

y = nba["PStat"]
y = y.astype({"PStat": "int"})
y_train = nba["PStat"]
y_train = y_train.astype({"PStat": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)

res = []
for index, row in nbaMod.iterrows():
    X_new_dict = row.to_dict() 
    X_new = vec.transform(X_new_dict)
    X_new_sc = scaler.transform(X_new)

    # K-Nearest Neighbors Model
    model = KNeighborsRegressor(n_neighbors=10)
    model.fit(X_train_sc, y_train)
    newPred = model.predict(X_new_sc).round()
    res.append(newPred)

nbaCompare = pd.DataFrame(nba.PStat, columns=["PStat"])
preds = pd.DataFrame(res, columns=["Predictions"])
nbaCompare = pd.concat([nbaCompare, preds], axis=1)
nbaCompare.plot.line(figsize=(30,20))

<AxesSubplot:>


# GRAPHING MSE

# Initalize variables
nbaModel = nba.PStat
features = ["Salary"]

X_dict = nba[features].to_dict(orient="records")
X_train_dict = nba[features].to_dict(orient="records")
X_new_dict = nbaMod[features].iloc[0].to_dict()

y = nba["PStat"]
y = y.astype({"PStat": "int"})
y_train = nba["PStat"]
y_train = y_train.astype({"PStat": "int"})

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
X_new = vec.transform(X_new_dict)

# Standardization
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_new_sc = scaler.transform(X_new)

# # K-Nearest Neighbors Model
model = KNeighborsRegressor(n_neighbors=30)
model.fit(X_train_sc, y_train)
display("The model predicts that the player at index zero, the player will make: %s" % model.predict(X_new_sc).round()[0])

#Compare results
nbaVerify = nba.iloc[0].PStat
display("The actual salary of the player at index zero: %s" % nbaVerify)

#Calculates estimate of test error based on 10-fold cross validation
def get_cv_error(k):
    model = KNeighborsRegressor(n_neighbors=k)
    pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
    result = (-(cross_val_score(pipeline, X_dict, y, cv=10, scoring="neg_mean_absolute_error"))).mean()
    return result
    
ks = pd.Series(range(1, 20))
ks.index = range(1, 20)
test_errs = ks.apply(get_cv_error)
test_errs.plot.line()

'The model predicts that the player at index zero, the player will make: -1.0'

'The actual salary of the player at index zero: 63.0273586723053'

<AxesSubplot:>

	Year	Name	Age	Team	Pos	GP	G	GPG	A	PTS	...	SHA	S	S%	TOI	ATOI	BLK	HIT	FOwin	FOloss	FO%
0	2018	Justin Abdelkader	30.0	DET	LW	75	13	0.1733	22	35	...	-	110	12	1,241	16:33	40.0	174.0	47.0	50.0	48.5
1	2018	Pontus Aberg	24.0	TOT	LW	53	4	0.0755	12	16	...	-	70	6	645	12:10	8.0	24.0	4.0	8.0	33.3
2	2018	Noel Acciari	26.0	BOS	C	60	10	0.1667	1	11	...	-	66	15	775	12:55	41.0	152.0	42.0	51.0	45.2
3	2018	Kenny Agostino	25.0	BOS	LW	5	-	0.0000	1	1	...	-	11	-	60	12:03	1.0	4.0	0.0	1.0	0.0
4	2018	Sebastian Aho	20.0	CAR	RW	78	29	0.3718	36	65	...	-	200	15	1,398	17:55	17.0	65.0	78.0	94.0	45.3
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
15807	2000	Teemu Selanne	29.0	MDA	RW	79	33	0.4177	52	85	...	-	236	14	1,795	22:44	NaN	NaN	NaN	NaN	NaN
15808	2000	Paul Kariya	25.0	MDA	LW	74	42	0.5676	44	86	...	-	324	13	1,803	0:22	NaN	NaN	NaN	NaN	NaN
15809	2000	Mark Recchi	31.0	PHI	RW	82	28	0.3415	63	91	...	1	223	13	1,781	21:43	NaN	NaN	NaN	NaN	NaN
15810	2000	Pavel Bure*	28.0	FLA	RW	74	58	0.7838	36	94	...	-	360	16	1,804	0:23	NaN	NaN	NaN	NaN	NaN
15811	2000	Jaromir Jagr	27.0	PIT	RW	63	42	0.6667	54	96	...	-	290	15	1,462	23:12	NaN	NaN	NaN	NaN	NaN

	Year	Name	Age	Team	GP	GS	W	L	T/O	GA	...	QS	QS%	RBS	GA%-	GSAA	G	A	PTS	PIM	Pos
2720	2000	Jean-Sebastien Aubin	22	PIT	51	NaN	23.0	21.0	3.0	120	...	NaN	NaN	NaN	90.0	13.09	0	1.0	1.0	2.0	G
2721	2000	Tom Barrasso	34	PIT	18	NaN	5.0	7.0	2.0	46	...	NaN	NaN	NaN	125.0	-9.10	0	0.0	0.0	6.0	G
2722	2000	Tom Barrasso	34	OTT	7	NaN	3.0	4.0	0.0	22	...	NaN	NaN	NaN	NaN	NaN	0	0.0	0.0	0.0	G
2723	2000	Ed Belfour	34	DAL	62	NaN	32.0	21.0	7.0	127	...	NaN	NaN	NaN	85.0	23.20	0	3.0	3.0	10.0	G
2724	2000	Zac Bierk	23	TBL	12	NaN	4.0	4.0	1.0	31	...	NaN	NaN	NaN	NaN	NaN	0	1.0	1.0	0.0	G
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
4420	2018	Semyon Varlamov	29	COL	51	47.0	24.0	16.0	6.0	128	...	28.0	0.596	5.0	91.0	11.99	0	0.0	0.0	2.0	G
4421	2018	Andrei Vasilevskiy	23	TBL	65	64.0	44.0	17.0	3.0	167	...	36.0	0.563	8.0	92.0	15.12	0	2.0	2.0	14.0	G
4422	2018	Cam Ward	33	CAR	43	42.0	23.0	14.0	4.0	112	...	21.0	0.500	10.0	108.0	-7.82	0	0.0	0.0	14.0	G
4423	2018	Scott Wedgewood	25	ARI	20	17.0	5.0	9.0	4.0	63	...	7.0	0.412	4.0	122.0	-11.48	0	0.0	0.0	2.0	G
4424	2018	Adam Wilcox	25	BUF	1	0.0	0.0	1.0	0.0	0	...	0.0	NaN	0.0	NaN	NaN	0	0.0	0.0	0.0	G

	Name
0	Justin Abdelkader
1	Pontus Aberg
2	Noel Acciari
3	Kenny Agostino
4	Sebastian Aho
...	...
17456	Adin Hill
17471	Maxime Lagace
17478	Alex Lyon
17504	Harri Sateri
17516	Adam Wilcox

	Name	Year	Salary
2	Peyton Manning	2000	11066000
28299	Jake Plummer	2000	3030000
26124	John Randle	2000	5000000
26111	Levon Kirkland	2000	4525000
28316	Shannon Sharpe	2000	5000000
...	...	...	...
23973	Kolton Miller	2019	1092332
16767	Bud Dupree	2019	9232000
23981	Tremaine Edmunds	2019	1055434
24057	Terrell Edmunds	2019	966263
20840	Wil Lutz	2019	6100000

	Season	Name	Pos	Age	Team	G	GS	MP	PER	TS%	...	ORB	DRB	TRB	AST	STL	BLK	TOV	PF	PTS	Salary
3	1999	Shareef Abdur-Rahim	SF	23.0	Vancouver Grizzlies	82.0	82.0	3223.0	20.2	0.547	...	218.0	607.0	825.0	271.0	89.0	87.0	249.0	244.0	1663.0	9000000.0
5	1999	Ray Allen	SG	24.0	Milwaukee Bucks	82.0	82.0	3070.0	20.6	0.570	...	83.0	276.0	359.0	308.0	110.0	19.0	183.0	187.0	1809.0	9000000.0
6	1999	Rafer Alston	PG	23.0	Milwaukee Bucks	27.0	0.0	361.0	4.3	0.310	...	5.0	18.0	23.0	70.0	12.0	0.0	29.0	29.0	60.0	301000.0
9	1999	Kenny Anderson	PG	29.0	Boston Celtics	82.0	82.0	2593.0	17.4	0.524	...	55.0	170.0	225.0	420.0	139.0	8.0	130.0	230.0	1149.0	6680000.0
11	1999	Shandon Anderson	SF	26.0	Houston Rockets	82.0	82.0	2700.0	13.8	0.567	...	91.0	293.0	384.0	239.0	96.0	32.0	194.0	182.0	1009.0	2000000.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
10207	2016	Cody Zeller	PF	24.0	Charlotte Hornets	62.0	58.0	1725.0	16.7	0.604	...	135.0	270.0	405.0	99.0	62.0	58.0	65.0	189.0	639.0	5318313.0
10208	2016	Tyler Zeller	C	27.0	Boston Celtics	51.0	5.0	525.0	13.0	0.508	...	43.0	81.0	124.0	42.0	7.0	21.0	20.0	61.0	178.0	8000000.0
10209	2016	Stephen Zimmerman	C	20.0	Orlando Magic	19.0	0.0	108.0	7.3	0.346	...	11.0	24.0	35.0	4.0	2.0	5.0	3.0	17.0	23.0	950000.0
10210	2016	Paul Zipser	SF	22.0	Chicago Bulls	44.0	18.0	843.0	6.9	0.503	...	15.0	110.0	125.0	36.0	15.0	16.0	40.0	78.0	240.0	750000.0
10211	2016	Ivica Zubac	C	19.0	Los Angeles Lakers	38.0	11.0	609.0	17.0	0.547	...	41.0	118.0	159.0	30.0	14.0	33.0	30.0	66.0	284.0	1034956.0

	Year	Name	Age	Team	Pos	GP	G	GPG	A	PTS	...	SV%	GAA	SO	GPS	MIN	QS	QS%	RBS	GA%-	GSAA
15397	2000	Aaron Gavey	25.0	DAL	C	41	7	0.1707	6	13	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
15297	2000	Aaron Miller	28.0	COL	D	53	1	0.0189	7	8	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
15196	2000	Aaron Ward	27.0	DET	D	36	1	0.0278	3	4	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
15288	2000	Adam Burt	31.0	PHI	D	67	1	0.0149	6	7	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
15705	2000	Adam Deadmarsh	24.0	COL	RW	71	18	0.2535	27	45	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
422	2018	Zack Kassian	27.0	EDM	RW	74	7	0.0946	12	19	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
562	2018	Zack Mitchell	25.0	MIN	RW	23	3	0.1304	2	5	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
772	2018	Zack Smith	29.0	OTT	LW	68	5	0.0735	14	19	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
134	2018	Zdeno Chara	40.0	BOS	D	73	7	0.0959	17	24	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
294	2018	Zemgus Girgensons	24.0	BUF	C	71	7	0.0986	8	15	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Name	Position	Year	Team	Games Played	Completion Percentage	Pass Attempts Per Game	Passing Yards	Passing Yards Per Game	TD Passes	...	Punting Touchbacks	Punts Returned	Yards Returned on Punts	TDs Returned on Punt	Total Fumbles	Total Touchbacks	Total TD Returns	Total Returns	Total Yards Returned	Total TDs
0	Shane Lechler	P	2001	Oakland Raiders	16	57.173042	0.0	1687.608618	138.741293	10.583483	...	12	34	502	0	2.0	32.0	0.0	0.0	0.0	0.0
1	Drew Brees	QB	2001	San Diego Chargers	1	55.6	27.0	221	221.0	1	...	NaN	NaN	NaN	NaN	2.0	0.0	0.0	0.0	0.0	2.0
2	Steve Smith	WR	2001	Carolina Panthers	15	57.173042	0.0	1687.608618	138.741293	10.583483	...	NaN	NaN	NaN	NaN	16.0	0.0	3.0	90.0	1795.0	1.0
3	Tom Brady	QB	2001	New England Patriots	15	63.9	27.5	2843	189.5	18	...	5.140187	29.228972	261.028037	0.308411	15.0	5.0	0.0	0.0	0.0	0.0
4	Adam Vinatieri	K	2001	New England Patriots	16	57.173042	0.0	1687.608618	138.741293	10.583483	...	0	0	0	0	0.0	6.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2803	Arrelious Benn	WR	2016	Jacksonville Jaguars	15	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	4.0	0.0	0.0	2.0	16.0	3.0
2802	Ben Braunecker	TE	2016	Chicago Bears	13	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	0.0	0.0	0.0	0.0	0.0	0.0
2801	Davante Adams	WR	2016	Green Bay Packers	16	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	4.0	0.0	0.0	0.0	0.0	12.0
2772	Roosevelt Nix-Jones	FB	2016	Pittsburgh Steelers	10	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	0.0	0.0	0.0	0.0	0.0	0.0
7167	Jacob Schum	P	2016	Green Bay Packers	16	NaN	NaN	NaN	NaN	NaN	...	4	16	151	0	0.0	4.0	0.0	0.0	0.0	0.0

		Salary
Year	Position
2000	C	1165438.0
	C/LW	1025000.0
	D	1200082.0
	LW	1162188.0
	RW	1460204.0
...	...	...
2018	LW/RW	8200000.0
	RW	2710574.0
	RW/C	1250000.0
	RW/LW	875000.0
	G	2805114.0

		Salary
Year	Position
2001	K	558833.0
	P	298000.0
	QB	1246500.0
	WR	709000.0
2002	DE	6904353.0
...	...	...
2016	RB	1464110.0
	SAF	291860.0
	SS	2422871.0
	TE	2038728.0
	WR	2307403.0

		Salary
Year	Position
1999	C	4488677.0
	PF	5042781.0
	PG	3735148.0
	SF	2703636.0
	SG	3513292.0
...	...	...
2016	PF	5625808.0
	PF-C	1709719.0
	PG	5000585.0
	SF	5624772.0
	SG	4791897.0

		Salary
Year	Team
2000	ATL	680882.0
	BOS	1118828.0
	BUF	981206.0
	CAR	1131776.0
	CGY	933316.0
...	...	...
2018	TOR	2216667.0
	VAN	2175000.0
	VEG	1680000.0
	WPG	2983333.0
	WSH	4175000.0

		Salary
Year	Team
2001	Carolina Panthers	709000.0
	Cleveland Browns	389000.0
	New England Patriots	489000.0
	Oakland Raiders	467750.0
	San Diego Chargers	2165000.0
...	...	...
2016	San Francisco 49ers	2133990.0
	Seattle Seahawks	2727249.0
	Tampa Bay Buccaneers	2401233.0
	Tennessee Titans	2236988.0
	Washington Redskins	2411496.0

CMPS 3160 Final Project¶

Modeling player stats and salaries by season/year for NHL, NBA, and NFL¶

By: Ryan Stevens, Alex Olteanu¶

Github.io: https://stevensryanw.github.io¶

GitHub: https://github.com/stevensryanw/stevensryanw.github.io¶

Introduction¶

Data Initialization¶

NHL Data Initializations¶

NHL Skater and Goalie Stats Initialization with some cleaning¶

Creating a dataframe with just the skater and goalie names in order to webscrape for salary data¶

Output datasets to csv files¶

NHL Salary Web Scraper¶

NHL Salary Data Initialization, Cleaning, and Merging¶

NHL Data Description¶

NFL Data Initializations¶

NFL Stats Initializations¶

NFL Stats Cleaning¶

Removing unwanted stats from each dataframe individually¶

Merging of all the NFL data with some cleaning¶

Generating and appending additional columns to braoder NFL df¶

Merging subset dfs (now a singular 'total' column) into the broader NFL df¶

NFL Salary Web Scraper¶

Salary Data Inizialization and Cleaning¶

Merging the NFL Stats and Salary Data¶

NFL Data Description¶

All¶

NBA Data Initializations¶

Creating the dataframe¶

NBA Data Description¶

Replacing missing data with the aveages¶

NHL¶

NFL¶

NBA¶

Outputting all finished to CSV¶

Exploratory Data Analysis¶

Graphs¶

Graph of average salary each year per league¶

Comments on the graphs¶

Graph of average salary by position each year per league¶

Separate graphs for each league¶

Comments on the graphs¶

Comments on the graphs¶

Comments on the graphs¶

Graph of average salary by team each year per league¶

Separate graphs for each league¶

Comments on the graphs¶

Comments on the graphs¶

Comments on the graphs¶

Graph of average points per year by position for NBA¶

Comments on the graphs¶

Graph of average goals per year by position for NHL¶

Comments on the graphs¶

Graph of average total touch downs per year by position for NFL¶

Comments on the graphs¶

Correlation Matrices¶

NHL¶

NFL¶

NBA¶

Scatter Plots Matrices¶

NHL¶

NFL¶

NBA¶

Data Analysis Using Machine Learning¶

Question 1: Can we use all player stats in each league to predict salary?¶

NHL¶

NFL¶

NBA¶

Question 2: Can we create a PSTAT perfomanace metric for each league? And can we use it to predict salary?¶

NHL¶

NFL¶

NBA¶

Question 3: Can we use salary to predict the PSTAT performance of a player in the NHL, NBA, and NFL?¶

NHL¶

NFL¶

NBA¶

Conclusion¶

EDA¶

Machine Learning¶

Github.io: https://stevensryanw.github.io ¶

GitHub: https://github.com/stevensryanw/stevensryanw.github.io ¶