In this report, I analyze a PGA Tour dataset (2010-2018) to find correlations between variables such as strokes gained, average distance, fairway accuracy, average score, points, wins, money, and more!
Describing Variables:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Load the PGA Tour statistics and convert the text-formatted columns to numbers.
pgadata = pd.read_csv('pgaTourData.csv')
pgadata.head()

# Replace NaNs in Top 10 and Wins with 0 and make them int.
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on the pgadata[...] selection: that form is
# chained assignment, which warns on recent pandas and leaves pgadata
# unchanged under Copy-on-Write, so the int cast would then fail on NaN.
for count_column in ('Top 10', 'Wins'):
    pgadata[count_column] = pgadata[count_column].fillna(0).astype(int)

# Drop every row that still contains a NaN in any column
pgadata.dropna(axis=0, inplace=True)

# Change Rounds to int
pgadata['Rounds'] = pgadata['Rounds'].astype(int)

# Points holds strings containing commas: strip them, then cast to int.
# regex=False makes the comma a literal on every pandas version.
pgadata['Points'] = pgadata['Points'].str.replace(',', '', regex=False).astype(int)

# Money holds strings containing "$" and ",": strip both in one pass, then
# cast to float. Inside the character class "$" is a literal, not an anchor.
pgadata['Money'] = pgadata['Money'].str.replace(r'[$,]', '', regex=True).astype(float)
pgadata.head()
Strokes Gained in Golf measures how good a player is on certain shots.
In a correlation matrix, a value close to 1 or -1 indicates a strong correlation, while a value close to 0 indicates a weak one.
The final column of this matrix (Average Score) is where we find the most interesting data.
# Correlation matrix of the strokes-gained columns against Average Score,
# rendered as a colour-graded table (blue-to-red 'coolwarm' palette).
sg_score_columns = [
    'Average SG Total',
    'SG:OTT',
    'SG:ARG',
    'Average SG Putts',
    'Average Score',
]
corrdata = pgadata.loc[:, sg_score_columns]
corr = corrdata.corr()
corr.style.background_gradient(cmap='coolwarm')
# Two side-by-side scatter panels of Average Score against a strokes-gained
# column, with a single master y-axis title for the whole figure.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        'Average Score vs. SG: Off the Tee',
        'Average Score vs. SG: Putting',
    ),
    y_title='Average Score',
)

# One entry per panel, left to right: (x column, trace name, x-axis title).
score_panels = (
    ('SG:OTT', 'Score v. SG:OTT', 'Strokes Gained: Off the Tee'),
    ('Average SG Putts', 'Score v. SG:Putt', 'Strokes Gained: Putting'),
)
for panel_col, (x_column, trace_name, x_label) in enumerate(score_panels, start=1):
    # Markers only; hovering a marker shows the value of 'Player Name'.
    fig.add_trace(
        go.Scatter(
            x=pgadata[x_column],
            y=pgadata['Average Score'],
            mode='markers',
            hovertext=pgadata['Player Name'],
            showlegend=False,
            name=trace_name,
        ),
        row=1,
        col=panel_col,
    )
    fig.update_xaxes(title_text=x_label, row=1, col=panel_col)

# Apply the same fixed axis ranges to every panel.
fig.update_xaxes(range=[-2, 2])
fig.update_yaxes(range=[68, 75])
fig.show()
Yellow Zone: Players who gained shots off the tee and averaged over par
When looking at this graph, it is interesting to see how few players reside in the Yellow Zone. That goes to show that players who gain shots off the tee also typically have lower scores.
The player with the highest average score who still gained shots off the tee was Derek Ernst
# Shade the four quadrants of the first panel (SG:OTT on x, Average Score on y),
# split at x = 0 and y = 72, then redraw the figure.
# Each entry is (x0, x1, y0, y1, colour); the colour is used for both the fill
# and the outline.
quadrants = (
    (0, 2, 68, 72, 'green'),    # SG:OTT above 0, Average Score below 72
    (-2, 0, 68, 72, 'blue'),    # SG:OTT below 0, Average Score below 72
    (0, 2, 72, 75, 'yellow'),   # SG:OTT above 0, Average Score above 72
    (-2, 0, 72, 75, 'red'),     # SG:OTT below 0, Average Score above 72
)
for left, right, bottom, top, shade in quadrants:
    fig.add_shape(
        type='rect',
        xref='x1',
        yref='y1',
        x0=left,
        x1=right,
        y0=bottom,
        y1=top,
        fillcolor=shade,
        line_color=shade,
        opacity=0.1,
    )
fig.show()
# Animated scatter of Average Score vs. SG:OTT with one frame per Year.
# Rows are sorted by Year first - presumably so the frames appear in
# chronological order.
sorted_df = pgadata.sort_values(by='Year')
fig = px.scatter(
    sorted_df,
    x='SG:OTT',
    y='Average Score',
    animation_frame='Year',
    hover_name='Player Name',
)

# args[1] of the first button in the first update menu holds the animation
# settings (NOTE(review): this looks like the play button px generates - confirm).
play_settings = fig.layout.updatemenus[0].buttons[0].args[1]
play_settings['frame']['duration'] = 1000
play_settings['transition']['duration'] = 1
fig.show()
# Correlation matrix of SG:OTT against fairway percentage and average distance,
# rendered as a colour-graded table (blue-to-red 'coolwarm' palette).
tee_shot_columns = ['SG:OTT', 'Fairway Percentage', 'Avg Distance']
corrdata = pgadata.loc[:, tee_shot_columns]
corr = corrdata.corr()
corr.style.background_gradient(cmap='coolwarm')
# Two side-by-side scatter panels with SG:OTT on the x-axis of both, and a
# single master x-axis title for the whole figure.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        'Fairway % vs. Strokes Gained: Off the Tee',
        'Average Distance vs. Strokes Gained: Off the Tee',
    ),
    x_title='Strokes Gained: Off the Tee',
)

# One entry per panel, left to right: (y column, trace name, y-axis title).
tee_panels = (
    ('Fairway Percentage', 'Fairway % v. SG:OTT', 'Fairway Percentage'),
    ('Avg Distance', 'SG:OTT vs. Avg Dist', 'Average Distance'),
)
for panel_col, (y_column, trace_name, y_label) in enumerate(tee_panels, start=1):
    # Markers only; hovering a marker shows the value of 'Player Name'.
    fig.add_trace(
        go.Scatter(
            x=pgadata['SG:OTT'],
            y=pgadata[y_column],
            mode='markers',
            hovertext=pgadata['Player Name'],
            showlegend=False,
            name=trace_name,
        ),
        row=1,
        col=panel_col,
    )
    fig.update_yaxes(title_text=y_label, row=1, col=panel_col)

# Same fixed x range on both panels; y ranges are left to plotly's defaults.
fig.update_xaxes(range=[-2, 2])
fig.show()