2 Tutorial 2

2.1 Topics

Summary statistics
Quantiles
Histograms
Encoding categorical variables

2.2 Important Python Packages

Pandas
Seaborn
Matplotlib

# import necessary packages
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

2.3 Today’s datasets

City Temperatures – Daily temperature for different international cities (download .csv here)

# load the temperature records from the CSV file into a DataFrame
csv_path = 'city_temp.csv'
temp_df = pd.read_csv(csv_path)

# display the table -- in visual studio code we can also do this another way
temp_df

	Country	City	Month	Day	Year	AvgTemperature
0	Malawi	Lilongwe	1	1	1995	69.5
1	Malawi	Lilongwe	1	2	1995	69.5
2	Malawi	Lilongwe	1	3	1995	67.5
3	Malawi	Lilongwe	1	4	1995	68.5
4	Malawi	Lilongwe	1	5	1995	66.7
...	...	...	...	...	...	...
47392	US	Rochester	5	9	2020	33.9
47393	US	Rochester	5	10	2020	41.4
47394	US	Rochester	5	11	2020	40.7
47395	US	Rochester	5	12	2020	38.9
47396	US	Rochester	5	13	2020	34.0

47397 rows × 6 columns

# list each distinct city name that appears in the data
city_column = temp_df['City']
city_column.unique()

array(['Lilongwe', 'Capetown', 'Tel Aviv', 'Amman', 'Beirut', 'Rochester'],
      dtype=object)

# keep only the rows that belong to a single city (e.g., Tel Aviv)
city = 'Tel Aviv'
is_chosen_city = temp_df['City'] == city
TA_temp = temp_df.loc[is_chosen_city]
TA_temp

	Country	City	Month	Day	Year	AvgTemperature
14959	Israel	Tel Aviv	1	1	1995	57.3
14960	Israel	Tel Aviv	1	2	1995	56.1
14961	Israel	Tel Aviv	1	3	1995	55.9
14962	Israel	Tel Aviv	1	4	1995	56.9
14963	Israel	Tel Aviv	1	5	1995	56.6
...	...	...	...	...	...	...
19595	Israel	Tel Aviv	9	11	2007	79.5
19596	Israel	Tel Aviv	9	12	2007	79.7
19597	Israel	Tel Aviv	9	13	2007	79.7
19598	Israel	Tel Aviv	9	14	2007	79.6
19599	Israel	Tel Aviv	9	15	2007	80.0

4641 rows × 6 columns

# summary statistics (count, mean, std, min, quartiles, max) for the chosen city
raw_temps = TA_temp['AvgTemperature']
raw_temps.describe()

count    4641.000000
mean       54.020448
std        50.624184
min       -99.000000
25%        59.400000
50%        68.700000
75%        78.600000
max        88.500000
Name: AvgTemperature, dtype: float64

# drop implausible readings: the summary above reports a minimum of -99, which
# is not a realistic daily temperature (presumably a missing-data placeholder --
# confirm against the dataset's documentation).
# .copy() makes TA_temp an independent DataFrame; without it, adding a column
# to the filtered slice below triggers pandas' SettingWithCopyWarning.
TA_temp = TA_temp.loc[TA_temp['AvgTemperature'] > -50].copy()

# convert to Celsius
TA_temp['AvgTemp_C'] = (TA_temp['AvgTemperature'] - 32) * (5 / 9)

/var/folders/wn/2bz1970d2w5182zy7h96yfcc0000gn/T/ipykernel_82655/3623248864.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TA_temp['AvgTemp_C'] = (TA_temp['AvgTemperature'] - 32)*(5/9)

# get summary statistics in Celsius
celsius_temps = TA_temp['AvgTemp_C']
celsius_temps.describe()

count    4196.000000
mean       21.249325
std         5.193370
min         7.277778
25%        16.555556
50%        21.611111
75%        26.277778
max        31.388889
Name: AvgTemp_C, dtype: float64

# get the mean for the city you chose
mean_temp = TA_temp['AvgTemp_C'].mean()
# report it rounded to two decimal places
print(f"The mean temperature in {city} is: {mean_temp:.2f} degrees Celsius")

The mean temperature in Tel Aviv is: 21.25 degrees Celcius

# get the median temperature for the city you chose
median_temp = TA_temp['AvgTemp_C'].median()
# report it rounded to two decimal places
print(f"The median temperature in {city} is: {median_temp:.2f} degrees Celsius")

The median temperature in Tel Aviv is: 21.61 degrees Celcius

# get the 10th percentile for the city you chose
# (quantile takes a fraction between 0 and 1, so 0.1 is the 10th percentile)
percentile_10 = TA_temp['AvgTemp_C'].quantile(.1)
print(f"The tenth percentile in {city} is: {percentile_10:.2f} degrees Celsius")

The tenth percentile in Tel Aviv is: 14.17 degrees Celcius

# get the 90th percentile for the city you chose
# (quantile takes a fraction between 0 and 1, so 0.9 is the 90th percentile)
percentile_90 = TA_temp['AvgTemp_C'].quantile(.9)
print(f"The ninetieth percentile in {city} is: {percentile_90:.2f} degrees Celsius")

The ninetieth percentile in Tel Aviv is: 27.67 degrees Celcius

# begin plotting: use seaborn's white-grid theme for the figures that follow
sns.set_theme(style="whitegrid")

# make a box plot of temperature for the city you chose
fig, ax = plt.subplots()
sns.boxplot(data=TA_temp, x='AvgTemp_C', ax=ax)
# replace the default axis label (the column name) with a readable one
ax.set(xlabel=f'Average Daily Temperature in {city}')

[Text(0.5, 0, 'Average Daily Temperature in Tel Aviv')]

# compare all the cities

# clean data: drop implausible readings below -50 (the single-city summary
# earlier showed a minimum of -99), then add a Celsius column.
# .copy() makes temp_df an independent DataFrame; without it, the column
# assignments on the filtered slice trigger pandas' SettingWithCopyWarning.
temp_df = temp_df.loc[temp_df['AvgTemperature'] > -50].copy()
temp_df['AvgTemp_C'] = (temp_df['AvgTemperature'] - 32) * (5 / 9)

# plot: one box per city
fig, ax = plt.subplots()
sns.boxplot(x=temp_df['AvgTemp_C'], y=temp_df['City'], ax=ax)
ax.set(xlabel='Average Daily Temperature')

/var/folders/wn/2bz1970d2w5182zy7h96yfcc0000gn/T/ipykernel_82655/3601528637.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['AvgTemp_C'] = (temp_df['AvgTemperature'] - 32)*(5/9)

[Text(0.5, 0, 'Average Daily Temperature')]

# make a violin plot of the temperature, one violin per city
fig, ax = plt.subplots()
sns.violinplot(data=temp_df, x='AvgTemp_C', y='City', ax=ax)
ax.set(xlabel='Average Daily Temperature')

[Text(0.5, 0, 'Average Daily Temperature')]

2.4 Try out

1. Plot the Average Daily Temperature of the year 2000 for all cities
2. Plot the Average Daily Temperature of January 1st for all cities in all years

Answer for 1

# keep only the readings recorded in the year 2000
recorded_in_2000 = temp_df['Year'] == 2000
milenium = temp_df.loc[recorded_in_2000]

# plot: one box per city
fig, ax = plt.subplots()
sns.boxplot(x=milenium['AvgTemp_C'], y=milenium['City'], ax=ax)
ax.set(xlabel='Average Daily Temperature in 2000')

[Text(0.5, 0, 'Average Daily Temperature in 2000')]

Answer for 2

# keep only the readings from January 1st (of every year)
is_first_day = temp_df['Day'] == 1
is_january = temp_df['Month'] == 1
jan_df = temp_df.loc[is_first_day & is_january]

# plot: one box per city
fig, ax = plt.subplots()
sns.boxplot(x=jan_df['AvgTemp_C'], y=jan_df['City'], ax=ax)
ax.set(xlabel='Average Daily Temperature on Jan 1st')

[Text(0.5, 0, 'Average Daily Temperature on Jan 1st')]

# make a histogram of the data for the city you chose (default binning)
fig, ax = plt.subplots()
sns.histplot(data=TA_temp, x='AvgTemp_C', ax=ax)
ax.set(xlabel=f'Average Daily Temperature in {city}')

[Text(0.5, 0, 'Average Daily Temperature in Tel Aviv')]

# play around with the bin size for the histogram -- try more bins
n_bins = 100
fig, ax = plt.subplots()
sns.histplot(data=TA_temp, x='AvgTemp_C', bins=n_bins, ax=ax)
ax.set(xlabel='Average Daily Temperature')

[Text(0.5, 0, 'Average Daily Temperature')]

# now fewer bins
fig, ax = plt.subplots()
sns.histplot(x=TA_temp['AvgTemp_C'], bins=10, ax=ax)
ax.set(xlabel='Average Daily Temperature')

[Text(0.5, 0, 'Average Daily Temperature')]

# add kernel density estimator: kde=True overlays a smoothed curve on the bars
fig, ax = plt.subplots()
sns.histplot(x=TA_temp['AvgTemp_C'], bins=20, kde=True, ax=ax)
ax.set(xlabel='Average Daily Temperature')

[Text(0.5, 0, 'Average Daily Temperature')]

# how can we normalize the histogram data?
# -> ask histplot for a normalized statistic instead of raw counts
fig, ax = plt.subplots()
sns.histplot(data=TA_temp, x='AvgTemp_C', bins=20, kde=True, stat="density", ax=ax)
ax.set(xlabel='Average Daily Temperature')

[Text(0.5, 0, 'Average Daily Temperature')]

What’s the difference between the “density” stat and the “probability” stat? Read the documentation.

# plotting 2 side by side: the same data under two different normalizations.
# sharey=True puts both panels on one y-axis scale so the bar heights can be
# compared directly.
fig, ax = plt.subplots(1, 2, sharey=True)

# left panel: stat="probability"
sns.histplot(x=TA_temp['AvgTemp_C'], bins=20, kde=True, stat="probability", ax=ax[0])
ax[0].set(title='probability', xlabel='Average Daily Temperature')

# right panel: stat="density"
sns.histplot(x=TA_temp['AvgTemp_C'], bins=20, kde=True, stat="density", ax=ax[1])
ax[1].set(title='density', xlabel='Average Daily Temperature')

[Text(0.5, 1.0, 'density'), Text(0.5, 0, 'Average Daily Temperature')]

# add the mean to the plot
mean_temp = TA_temp['AvgTemp_C'].mean()

fig, ax = plt.subplots()
sns.histplot(data=TA_temp, x='AvgTemp_C', bins=20, kde=True,
             stat="probability", ax=ax)
ax.set(xlabel='Average Daily Temperature')

# red vertical line at the mean
ax.axvline(mean_temp, label='mean', color='r')

# matching red caption centred on the line; y=0.14 is a hand-picked height
caption_style = dict(va='bottom', ha='center', fontsize=14,
                     weight='bold', color='r')
ax.text(mean_temp, 0.14, 'mean', **caption_style)

Text(21.249324753733717, 0.14, 'mean')

2.5 Try out

add the mode
add the median

# make a histogram of the data, with one colour (hue) per city
fig, ax = plt.subplots()
sns.histplot(data=temp_df, x='AvgTemp_C', hue='City', ax=ax)
ax.set(xlabel='Average Daily Temperature')

[Text(0.5, 0, 'Average Daily Temperature')]

# another type of histogram: temperature on x, city on y, coloured by city
# (legend=False because the y-axis already names each city)
fig, ax = plt.subplots()
sns.histplot(data=temp_df, x='AvgTemp_C', y='City', hue='City',
             legend=False, ax=ax)

<Axes: xlabel='AvgTemp_C', ylabel='City'>

2.6 Encoding categorical variables

Sometimes, for reasons that will be clear on the HW, we’ll want to encode our categorical variables so that they are numbers instead.

There are many ways that we can achieve this.

Here we will learn one; for more examples see: https://pbpython.com/categorical-encoding.html

# what are our cities again?
city_labels = temp_df['City']
city_labels.unique()

array(['Lilongwe', 'Capetown', 'Tel Aviv', 'Amman', 'Beirut', 'Rochester'],
      dtype=object)

# dictionary for encoding cities (note: we can encode more than one variable at a time)
# outer key   -> name of the column to recode
# inner dict  -> city name mapped to an integer code, numbered from 1 in list order
city_names = ["Lilongwe", "Capetown", "Tel Aviv", "Amman", "Beirut", "Rochester"]
cleanup_cities = {
    "City": {name: code for code, name in enumerate(city_names, start=1)},
}

# new dataframe with encoded values: replace() returns a recoded copy and
# leaves temp_df itself unchanged
temp_df_encoded = temp_df.replace(to_replace=cleanup_cities)
# check the resulting column types
temp_df_encoded.dtypes

Country            object
City                int64
Month               int64
Day                 int64
Year                int64
AvgTemperature    float64
AvgTemp_C         float64
dtype: object

# option 2 -- use Pandas

# what are our data types?
# (inspect the column types before changing any of them)
temp_df.dtypes

Country            object
City               object
Month               int64
Day                 int64
Year                int64
AvgTemperature    float64
AvgTemp_C         float64
dtype: object

# assign city to be a categorical variable
city_as_category = temp_df["City"].astype('category')
temp_df["City"] = city_as_category
# confirm the dtype of the City column changed
temp_df.dtypes

/var/folders/wn/2bz1970d2w5182zy7h96yfcc0000gn/T/ipykernel_82655/1724604918.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df["City"] = temp_df["City"].astype('category')

Country             object
City              category
Month                int64
Day                  int64
Year                 int64
AvgTemperature     float64
AvgTemp_C          float64
dtype: object

# use codes to encode variable: .cat.codes gives the integer code of each
# row's category
city_codes = temp_df["City"].cat.codes
temp_df["City_encoded"] = city_codes
temp_df.dtypes

/var/folders/wn/2bz1970d2w5182zy7h96yfcc0000gn/T/ipykernel_82655/3917241889.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df["City_encoded"] = temp_df["City"].cat.codes

Country             object
City              category
Month                int64
Day                  int64
Year                 int64
AvgTemperature     float64
AvgTemp_C          float64
City_encoded          int8
dtype: object