14. Data Analysis and More Data Visualization with Python I

14. Data Analysis and More Data Visualization with Python I#

Please refer to here for the data in this lecture.

Last time#

Python package: NumPy
Python package: Matplotlib

Today#

Python package: Pandas

Python package: `pandas`#

pandas is a Python package providing fast, flexible, and expressive data structures designed to make working with relational or labeled data both easy and intuitive.
Reference:
1. Pandas documentation
2. Pandas tutorials

Install `pandas`#

Activate your virtual environment
Install pandas
```
pip install pandas
```

DataFrame and Series#

A DataFrame is a 2-dimensional data structure that can store data of different types (including characters, integers, floating point values, categorical data and more) in columns.
Each column in a DataFrame is a Series.

import pandas as pd

df1 = pd.DataFrame({
    "Student ID": ["112514001", "112514002", "112514003", "112514004", "112514005"],
    "Gender": ["Female", "Male", "Third", "Female", "Male"],
    "HW#1": [100, 0, 70, 90, 80],
    "HW#2": [90, 60, 95, 85, 100],
    "HW#3": [100, 75, 80, 100, 60],
})

print(df1)

  Student ID  Gender  HW#1  HW#2  HW#3
112514001  Female   100    90   100
112514002    Male     0    60    75
112514003   Third    70    95    80
112514004  Female    90    85   100
112514005    Male    80   100    60

print(df1["HW#1"])

  100
    0
   70
   90
   80
Name: HW#1, dtype: int64

print(df1.iloc[0])
print("type(df1.iloc[0]):", type(df1.iloc[0]))

Student ID    112514001
Gender           Female
HW#1                100
HW#2                 90
HW#3                100
Name: 0, dtype: object
type(df1.iloc[0]): <class 'pandas.core.series.Series'>

import numpy as np

hw4 = pd.Series(np.random.randint(0, 100, size=(5,)), name="HW#4")

print(hw4)

  99
  69
   1
  83
  72
Name: HW#4, dtype: int32

# Add new column
df1["HW#4"] = hw4

print(df1)

  Student ID  Gender  HW#1  HW#2  HW#3  HW#4
112514001  Female   100    90   100    99
112514002    Male     0    60    75    69
112514003   Third    70    95    80     1
112514004  Female    90    85   100    83
112514005    Male    80   100    60    72

# Add new column from existing columns
df1["Final"] = df1["HW#1"]*0.3 + df1["HW#2"]*0.3 + df1["HW#3"]*0.3 + df1["HW#4"]*0.1

print(df1)

  Student ID  Gender  HW#1  HW#2  HW#3  HW#4  Final
112514001  Female   100    90   100    99   96.9
112514002    Male     0    60    75    69   47.4
112514003   Third    70    95    80     1   73.6
112514004  Female    90    85   100    83   90.8
112514005    Male    80   100    60    72   79.2

Using pd.DataFrame.describe() to acquire the basic statistics

# Only for the numerical data
df1.describe()

	HW#1	HW#2	HW#3	HW#4	Final
count	5.000000	5.000000	5.000000	5.00000	5.000000
mean	68.000000	86.000000	83.000000	64.80000	77.580000
std	39.623226	15.572412	17.175564	37.55263	19.218012
min	0.000000	60.000000	60.000000	1.00000	47.400000
25%	70.000000	85.000000	75.000000	69.00000	73.600000
50%	80.000000	90.000000	80.000000	72.00000	79.200000
75%	90.000000	95.000000	100.000000	83.00000	90.800000
max	100.000000	100.000000	100.000000	99.00000	96.900000

# Get dtype of each column
print(df1.dtypes)

# Get the shape of the DataFrame
print(df1.shape)

Student ID     object
Gender         object
HW#1            int64
HW#2            int64
HW#3            int64
HW#4            int32
Final         float64
dtype: object
(5, 7)

# data_range

num = 10
dates = pd.date_range("2050-12-25", periods=5, freq="W-THU")
idx = ["1395140{:02d}".format(i+1) for i in range(num)]

df2 = pd.DataFrame(np.random.randint(0, 3, (num, len(dates))), index=idx, columns=dates)

print(df2)
print(df2.index)
print(df2.columns)

           2050-12-29  2051-01-05  2051-01-12  2051-01-19  2051-01-26
139514001           0           1           0           1           0
139514002           2           2           1           2           0
139514003           1           2           0           0           1
139514004           1           1           0           0           0
139514005           0           2           2           1           2
139514006           0           2           2           1           0
139514007           0           2           0           1           1
139514008           1           0           2           0           2
139514009           0           1           2           0           0
139514010           1           2           2           1           2
Index(['139514001', '139514002', '139514003', '139514004', '139514005',
       '139514006', '139514007', '139514008', '139514009', '139514010'],
      dtype='object')
DatetimeIndex(['2050-12-29', '2051-01-05', '2051-01-12', '2051-01-19',
               '2051-01-26'],
              dtype='datetime64[ns]', freq='W-THU')

Read and write tabular data#

Use DataFrame.to_* for exporting data
1. to_csv
2. to_excel
3. …
Use pd.read_* for reading data
1. read_csv
2. read_excel
3. …
Note. If you want to read excel file via pandas, you need to install openpyxl by pip install openpyxl first.

# exporting df1 as a .csv file

# df1.to_csv("test.csv")

# importing data from a .csv file
batting = pd.read_csv(".//data//data_batting.csv")

print(batting)

        playerID  yearID  stint teamID lgID    G   AB   R   H  2B  ...  RBI  \
     abadan01    2001      1    OAK   AL    1    1   0   0   0  ...    0   
    abbotje01    2001      1    FLO   NL   28   42   5  11   3  ...    5   
    abbotku01    2001      1    ATL   NL    6    9   0   2   0  ...    0   
    abbotpa01    2001      1    SEA   AL   28    4   0   1   0  ...    0   
    abernbr01    2001      1    TBA   AL   79  304  43  82  17  ...   33   
...          ...     ...    ...    ...  ...  ...  ...  ..  ..  ..  ...  ...   
 zitoba01    2015      1    OAK   AL    3    0   0   0   0  ...    0   
zobribe01    2015      1    OAK   AL   67  235  39  63  20  ...   33   
zobribe01    2015      2    KCA   AL   59  232  37  66  16  ...   23   
zuninmi01    2015      1    SEA   AL  112  350  28  61  11  ...   28   
 zychto01    2015      1    SEA   AL   13    0   0   0   0  ...    0   

       SB  CS  BB   SO  IBB  HBP  SH  SF  GIDP  
     0   0   0    0    0    0   0   0     0  
     0   0   3    7    0    1   0   0     1  
     1   0   0    3    0    0   0   0     0  
     0   0   0    1    0    0   1   0     0  
     8   3  27   35    1    0   3   1     3  
...    ..  ..  ..  ...  ...  ...  ..  ..   ...  
 0   0   0    0    0    0   0   0     0  
 1   1  33   26    2    0   0   3     5  
 2   3  29   30    1    1   0   2     3  
 0   1  21  132    0    5   8   2     6  
 0   0   0    0    0    0   0   0     0  

[20700 rows x 22 columns]

# Only show the first 5 rows
print(batting.head())

# Change n if you need
# print(batting.head(n=7))

    playerID  yearID  stint teamID lgID   G   AB   R   H  2B  ...  RBI  SB  \
 abadan01    2001      1    OAK   AL   1    1   0   0   0  ...    0   0   
abbotje01    2001      1    FLO   NL  28   42   5  11   3  ...    5   0   
abbotku01    2001      1    ATL   NL   6    9   0   2   0  ...    0   1   
abbotpa01    2001      1    SEA   AL  28    4   0   1   0  ...    0   0   
abernbr01    2001      1    TBA   AL  79  304  43  82  17  ...   33   8   

   CS  BB  SO  IBB  HBP  SH  SF  GIDP  
 0   0   0    0    0   0   0     0  
 0   3   7    0    1   0   0     1  
 0   0   3    0    0   0   0     0  
 0   0   1    0    0   1   0     0  
 3  27  35    1    0   3   1     3  

[5 rows x 22 columns]

Filter a subset of a `DataFrame`#

Example: Extracting the batting statistics of Albert Pujols

# method 1
tmp_list = []

for i in range(batting.shape[0]):
    if batting.iloc[i]["playerID"] == "pujolal01":
        tmp_list.append(batting.iloc[i])

method1 = pd.DataFrame(tmp_list, columns=batting.columns)

print(method1)

        playerID  yearID  stint teamID lgID    G   AB    R    H  2B  ...  RBI  \
  pujolal01    2001      1    SLN   NL  161  590  112  194  47  ...  130   
 pujolal01    2002      1    SLN   NL  157  590  118  185  40  ...  127   
 pujolal01    2003      1    SLN   NL  157  591  137  212  51  ...  124   
 pujolal01    2004      1    SLN   NL  154  592  133  196  51  ...  123   
 pujolal01    2005      1    SLN   NL  161  591  129  195  38  ...  117   
 pujolal01    2006      1    SLN   NL  143  535  119  177  33  ...  137   
 pujolal01    2007      1    SLN   NL  158  565   99  185  38  ...  103   
pujolal01    2008      1    SLN   NL  148  524  100  187  44  ...  116   
pujolal01    2009      1    SLN   NL  160  568  124  186  45  ...  135   
pujolal01    2010      1    SLN   NL  159  587  115  183  39  ...  118   
pujolal01    2011      1    SLN   NL  147  579  105  173  29  ...   99   
pujolal01    2012      1    LAA   AL  154  607   85  173  50  ...  105   
pujolal01    2013      1    LAA   AL   99  391   49  101  19  ...   64   
pujolal01    2014      1    LAA   AL  159  633   89  172  37  ...  105   
pujolal01    2015      1    LAA   AL  157  602   85  147  22  ...   95   

       SB  CS   BB  SO  IBB  HBP  SH  SF  GIDP  
   1   3   69  93    6    9   1   7    21  
  2   4   72  69   13    9   0   4    20  
  5   1   79  65   12   10   0   5    13  
  5   5   84  52   12    7   0   9    21  
 16   2   97  65   27    9   0   3    19  
  7   2   92  50   28    4   0   3    20  
  2   6   99  58   22    7   0   8    27  
 7   3  104  54   34    5   0   8    16  
16   4  115  64   44    9   0   8    23  
14   4  103  76   38    4   0   6    23  
 9   1   61  58   15    4   0   7    29  
 8   1   52  76   16    5   0   6    19  
 1   1   40  55    8    5   0   7    18  
 5   1   48  71   11    5   0   9    28  
 5   3   50  72   10    6   0   3    15  

[15 rows x 22 columns]

# method 2
method2 = batting[batting["playerID"] == "pujolal01"]

print(method2)

        playerID  yearID  stint teamID lgID    G   AB    R    H  2B  ...  RBI  \
  pujolal01    2001      1    SLN   NL  161  590  112  194  47  ...  130   
 pujolal01    2002      1    SLN   NL  157  590  118  185  40  ...  127   
 pujolal01    2003      1    SLN   NL  157  591  137  212  51  ...  124   
 pujolal01    2004      1    SLN   NL  154  592  133  196  51  ...  123   
 pujolal01    2005      1    SLN   NL  161  591  129  195  38  ...  117   
 pujolal01    2006      1    SLN   NL  143  535  119  177  33  ...  137   
 pujolal01    2007      1    SLN   NL  158  565   99  185  38  ...  103   
pujolal01    2008      1    SLN   NL  148  524  100  187  44  ...  116   
pujolal01    2009      1    SLN   NL  160  568  124  186  45  ...  135   
pujolal01    2010      1    SLN   NL  159  587  115  183  39  ...  118   
pujolal01    2011      1    SLN   NL  147  579  105  173  29  ...   99   
pujolal01    2012      1    LAA   AL  154  607   85  173  50  ...  105   
pujolal01    2013      1    LAA   AL   99  391   49  101  19  ...   64   
pujolal01    2014      1    LAA   AL  159  633   89  172  37  ...  105   
pujolal01    2015      1    LAA   AL  157  602   85  147  22  ...   95   

       SB  CS   BB  SO  IBB  HBP  SH  SF  GIDP  
   1   3   69  93    6    9   1   7    21  
  2   4   72  69   13    9   0   4    20  
  5   1   79  65   12   10   0   5    13  
  5   5   84  52   12    7   0   9    21  
 16   2   97  65   27    9   0   3    19  
  7   2   92  50   28    4   0   3    20  
  2   6   99  58   22    7   0   8    27  
 7   3  104  54   34    5   0   8    16  
16   4  115  64   44    9   0   8    23  
14   4  103  76   38    4   0   6    23  
 9   1   61  58   15    4   0   7    29  
 8   1   52  76   16    5   0   6    19  
 1   1   40  55    8    5   0   7    18  
 5   1   48  71   11    5   0   9    28  
 5   3   50  72   10    6   0   3    15  

[15 rows x 22 columns]

Exercise 14.1: calculate the slugging percentage (SLG)#

Please write a program that calculates the slugging percentage (SLG) of Albert Pujols during his first career in St. Louis Cardinals (2001-2011).
Please also calculate the average SLG of the entire league during 2001-2011.
The definition of SLG is:

\[ \text{SLG} = \frac{1 \times (\text{H}) + 2 \times (\text{2B}) + 3 \times (\text{3B}) + 4 \times (\text{HR})}{\text{AB}} \]

Please plot the result:

Exercise 14.2: calculate the OPS+#

Please write a program that calculates the On-base Plus Slugging Plus (\(\text{OPS}^+\)) of all players that had played in MLB 2023 seasons.
The definitions of Batting Average (BA), On-based Percentage (OBP), On-based Plus Slugging (OPS), and \(\text{OPS}^+\) are:

\[ \begin{align}\begin{aligned} \begin{aligned}\\\begin{split}\text{BA} &= \frac{\text{H}}{\text{AB}}\\\end{split}\\\begin{split}\text{OBP} &= \frac{\text{H} + \text{BB} + \text{HBP}}{\text{AB}}\\\end{split}\\\begin{split}\text{SLG} &= \frac{1 \times (\text{H}) + 2 \times (\text{2B}) + 3 \times (\text{3B}) + 4 \times (\text{HR})}{\text{AB}}\\\end{split}\\\begin{split}\text{OPS} &= \text{OBP} + \text{SLG}\\\end{split}\\\begin{split}\text{OPS}^+ &= 100*[\frac{\text{player OBP}}{\text{league OBP}} + \frac{\text{player SLG}}{\text{league SLG}} - 1]\\\end{split}\\\end{aligned} \end{aligned}\end{align} \]

Your data is here.
Please plot the result:

Style 1

Style 2

Exercise 14.3: calculate the wOBA, wRAA, and wRC#

Please write a program that calculates the weighted on-base average (wOBA), weighted runs above average (wRAA), and weighted Runs Created (wRC) of all player that had played in MLB 2023 season.

Your data is here.

The definitions of wOBA, wRAA, and wRC of MLB 2023 are:

\[ \begin{align}\begin{aligned}\begin{split} \begin{aligned} \text{wOBA} &= \frac{0.696 \cdot \text{uBB} + 0.726 \cdot \text{HBP} + 0.883 \cdot \text{1B}+ 1.244 \cdot \text{2B} + 1.569 \cdot \text{3B} + 2.004 \cdot \text{HR}}{\text{AB} + \text{BB} - \text{IBB} + \text{SF} + \text{HBP}}\\\end{split}\\\begin{split}\text{uBB} &= \text{BB} - \text{IBB}\\\end{split}\\\begin{split}\text{wRC} &= (\frac{\text{wOBA} - \text{league wOBA}}{\text{wOBA scale}} + \text{league \(\frac {\text{R}}{\text{PA}}\)}) \times \text{PA}\\\end{split}\\\text{wRC} &= (\frac{\text{wOBA} - 0.318}{1.204} + 0.122) \times \text{PA}\\\end{aligned} \end{aligned}\end{align} \]

Please plot the result:

Style 1

Style 2

Reference#

The data of this lecture is from Baseball Reference, FanGraphs BaseBall, and Kaggle.com.

Don't click this

Exercise 14.1

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

batting = pd.read_csv(".//data//data_batting.csv")
pujols = batting[batting["playerID"] == "pujolal01"]

data = pujols[pujols["teamID"] == "SLN"][["playerID", "yearID", "teamID", "AB", "H", "2B", "3B", "HR"]]

data["SLG"] = (((data["H"]-data["2B"]-data["3B"]-data["HR"]) + 2*data["2B"] + 3*data["3B"] + 4*data["HR"])/data["AB"]).round(2)

slg_all = []

for i in range(data.shape[0]):
    data_all = batting[batting["yearID"] == (2001+i) ][["playerID", "yearID", "teamID", "AB", "H", "2B", "3B", "HR"]]
    data_all["SLG"] = (((data_all["H"]-data_all["2B"]-data_all["3B"]-data_all["HR"]) + \
        2*data_all["2B"] + 3*data_all["3B"] + 4*data_all["HR"])/data_all["AB"])

    slg_all.append(data_all["SLG"].mean())    

data["League SLG"] = slg_all

fig = plt.figure(1, figsize=(6,4), dpi=100)
ax = fig.add_subplot(111)
ax.plot(data["yearID"], data["SLG"], label="Albert Pujols")
ax.plot(data["yearID"], data["League SLG"], label="League SLG")
ax.set_title("Albert Pujols' Batting Status", fontweight='bold')
ax.set_xlabel("Year", fontweight='bold')
ax.set_ylabel("SLG", fontweight='bold')
ax.grid(True)
ax.legend()

plt.show()

Exercise 14.2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

batting2023 = pd.read_excel("./data/data_batting_2021-2023.xlsx", "2023")

# PA >= 100
batting2023 = batting2023[batting2023["PA"] >= 100]

# BA
batting2023["BA"] = batting2023["H"]/batting2023["AB"]

# OBP
batting2023["OBP"] = (batting2023["H"] + batting2023["BB"] + batting2023["HBP"])/batting2023["PA"]

# SLG
batting2023["SLG"] = (
    (batting2023["H"]-batting2023["2B"]-batting2023["3B"]-batting2023["HR"])*1 + \
    batting2023["2B"]*2 + batting2023["3B"]*3 + batting2023["HR"]*4
)/batting2023["AB"]
    
# OPS
batting2023["OPS"] = batting2023["OBP"] + batting2023["SLG"]

# OPS+
batting2023["OPS+"] = 100*((batting2023["SLG"]/batting2023.iloc[-1]["SLG"]) + (batting2023["OBP"]/batting2023.iloc[-1]["OBP"]) - 1)

# Extract data
df = batting2023.sort_values(by="OPS+", ascending=False)

players, ops_plus, ba = [], [], []

for i in range(5):
    players.append(df.iloc[i]["Name"])
    ops_plus.append(df.iloc[i]["OPS+"])
    ba.append(df.iloc[i]["BA"])

for i in range(15):
    pick = np.random.randint(5, df.shape[0])
    
    players.append(df.iloc[pick]["Name"])
    ops_plus.append(df.iloc[pick]["OPS+"])
    ba.append(df.iloc[pick]["BA"])

Exercise 14.2: Style 1

fig = plt.figure(1, figsize=(12,4), dpi=100, facecolor="w")

ax = fig.add_subplot(111)

ax.scatter(range(len(players)), ops_plus, c="r")

ax.plot(
    range(len(players)), 
    np.ones((len(players),))*100, 
    'r--', 
    label=r"League average OPS$^+$",
)

ax.set_ylabel(r"OPS$^+$", color='r')
ax.set_xticks(range(len(players)))
ax.set_xticklabels(players, fontsize=8)

for label in ax.get_xticklabels():
    label.set_rotation(40)
    label.set_horizontalalignment('center')

ax.tick_params(axis='x', which='both', direction='inout')
ax.tick_params(axis='y', labelcolor='r')

ax2 = ax.twinx()
ax2.scatter(range(len(players)), ba, c="b")

ax2.plot(
    range(len(players)), 
    np.ones((len(players),))*batting2023.iloc[-1]["BA"], 
    'b--', 
    label="Average BA = {:.03f}".format(batting2023.iloc[-1]["BA"]),
)

ax2.set_ylabel("BA", color='b')
ax2.tick_params(axis='y', labelcolor='b')

fig.suptitle(r"OPS$^+$ vs BA of players")
fig.legend(labelcolor='linecolor')
# fig.savefig("exercise2_1.png", bbox_inches='tight', facecolor='white')

plt.show()

Exercise 14.2: Style 2

from mpl_toolkits.axes_grid1 import host_subplot

fig2 = plt.figure(2, figsize=(12,4), dpi=100, facecolor="w")

host = host_subplot(111)
par = host.twinx()

host.scatter(range(len(players)), ops_plus, c="r")
par.scatter(range(len(players)), ba, c="b")

data1, = host.plot(
    range(len(players)), 
    np.ones((len(players),))*100, 
    'r--', 
    label=r"League average OPS$^+$",
)

data2, = par.plot(
    range(len(players)), 
    np.ones((len(players),))*batting2023.iloc[-1]["BA"], 
    'b--', 
    label="Average BA = {:.03f}".format(batting2023.iloc[-1]["BA"]),
)

host.set_ylabel(r"OPS$^+$", color='r')
host.set_xticks(range(len(players)))
host.set_xticklabels(players, fontsize=8)

for label in host.get_xticklabels():
    label.set_rotation(40)
    label.set_horizontalalignment('center')

host.tick_params(axis='x', which='both', direction='inout')
host.tick_params(axis='y', labelcolor='r')
host.legend(labelcolor="linecolor")

par.set_ylabel("BA", color='b')
par.tick_params(axis='y', labelcolor='b')

fig2.suptitle(r"OPS$^+$ vs BA of players")
# fig2.savefig("exercise2_2.png", bbox_inches='tight', facecolor='white')

plt.show()

Exercise 14.3

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

batting2023 = pd.read_excel("./data/data_batting_2021-2023.xlsx", "2023")

# PA >= 100
batting2023 = batting2023[batting2023["PA"] >= 100]

# wOBA
batting2023["wOBA"] = (
    0.696 * (batting2023["BB"] - batting2023["IBB"]) + \
    0.726 * batting2023["HBP"] + \
    0.883 * (batting2023["H"]-batting2023["2B"]-batting2023["3B"]-batting2023["HR"]) + \
    1.244 * batting2023["2B"] + \
    1.569 * batting2023["3B"] + \
    2.004 * batting2023["HR"]
) / (batting2023["AB"] + batting2023["BB"] + batting2023["SF"] + batting2023["HBP"] - batting2023["IBB"])

# wRC
batting2023["wRC"] = (
    (batting2023["wOBA"] - 0.318)/1.204 + 0.122
) * batting2023["PA"]

# Extract data
df = batting2023.sort_values(by="wRC", ascending=False)

players, wRCs, wOBAs = [], [], []

for i in range(5):
    players.append(df.iloc[i]["Name"])
    wRCs.append(df.iloc[i]["wRC"])
    wOBAs.append(df.iloc[i]["wOBA"])

for i in range(15):
    pick = np.random.randint(5, df.shape[0])
    
    players.append(df.iloc[pick]["Name"])
    wRCs.append(df.iloc[pick]["wRC"])
    wOBAs.append(df.iloc[pick]["wOBA"])