Homework 1
In this blog, I will use the NOAA climate data to create several in interactive data graphics.
1.Create a Database
Import data
import pandas as pd
import sqlite3
To determine whether the object has the right type of data
temps = pd.read_csv("temps_stacked.csv")
temps.head()
ID | Year | Month | Temp | |
---|---|---|---|---|
0 | ACW00011604 | 1961 | 1 | -0.89 |
1 | ACW00011604 | 1961 | 2 | 2.36 |
2 | ACW00011604 | 1961 | 3 | 4.72 |
3 | ACW00011604 | 1961 | 4 | 7.73 |
4 | ACW00011604 | 1961 | 5 | 11.28 |
stations = pd.read_csv('station-metadata.csv')
stations.head()
ID | LATITUDE | LONGITUDE | STNELEV | NAME | |
---|---|---|---|---|---|
0 | ACW00011604 | 57.7667 | 11.8667 | 18.0 | SAVE |
1 | AE000041196 | 25.3330 | 55.5170 | 34.0 | SHARJAH_INTER_AIRP |
2 | AEM00041184 | 25.6170 | 55.9330 | 31.0 | RAS_AL_KHAIMAH_INTE |
3 | AEM00041194 | 25.2550 | 55.3640 | 10.4 | DUBAI_INTL |
4 | AEM00041216 | 24.4300 | 54.4700 | 3.0 | ABU_DHABI_BATEEN_AIR |
countries = pd.read_csv('countries.csv')
countries.head()
FIPS 10-4 | ISO 3166 | Name | |
---|---|---|---|
0 | AF | AF | Afghanistan |
1 | AX | - | Akrotiri |
2 | AL | AL | Albania |
3 | AG | DZ | Algeria |
4 | AQ | AS | American Samoa |
Whitespaces in column names are bad for SQL.Thus ,we need to rename all columns in a table removing whitespace.
# whitespaces in column names are bad for SQL
countries = countries.rename(columns= {"FIPS 10-4": "FIPS_10-4"})
countries = countries.rename(columns= {"ISO 3166": "ISO_3166"})
countries.head()
FIPS_10-4 | ISO_3166 | Name | |
---|---|---|---|
0 | AF | AF | Afghanistan |
1 | AX | - | Akrotiri |
2 | AL | AL | Albania |
3 | AG | DZ | Algeria |
4 | AQ | AS | American Samoa |
Write a Query Function
Creating a Connection object that represents the database, and store the data in the temps.db file.
# open a connection to temps.db
conn = sqlite3.connect("temps.db")
temps.to_sql("temperatures", conn, if_exists="replace", index=False)
countries.to_sql("countries", conn, if_exists="replace", index=False)
stations.to_sql("stations", conn, if_exists="replace", index=False)
# close your connection
conn.close()
Define a function query_climate_database( ) to query the data from the temps.db.
def query_climate_database(country, year_begin, year_end, month):
conn = sqlite3.connect("temps.db")
cmd = \
"""
SELECT S.name, S.latitude, S.longitude, C.name , T.year, T.month, T.temp
FROM temperatures T
LEFT JOIN stations S ON T.id = S.id
LEFT JOIN countries C ON SUBSTRING(S.id,1,2) = C."fips_10-4"
WHERE C.name = ? AND (T.year BETWEEN ? AND ?) AND T.month = ?
"""
df = pd.read_sql_query(cmd, conn, params=(country, year_begin, year_end, month))
df = df.rename(columns= {"Name": "Country"})
conn.close()
return df
query_climate_database(country = "India",
year_begin = 1980,
year_end = 2020,
month = 1)
NAME | LATITUDE | LONGITUDE | Country | Year | Month | Temp | |
---|---|---|---|---|---|---|---|
0 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1980 | 1 | 23.48 |
1 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1981 | 1 | 24.57 |
2 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1982 | 1 | 24.19 |
3 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1983 | 1 | 23.51 |
4 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1984 | 1 | 24.81 |
... | ... | ... | ... | ... | ... | ... | ... |
3147 | DARJEELING | 27.050 | 88.270 | India | 1983 | 1 | 5.10 |
3148 | DARJEELING | 27.050 | 88.270 | India | 1986 | 1 | 6.90 |
3149 | DARJEELING | 27.050 | 88.270 | India | 1994 | 1 | 8.10 |
3150 | DARJEELING | 27.050 | 88.270 | India | 1995 | 1 | 5.60 |
3151 | DARJEELING | 27.050 | 88.270 | India | 1997 | 1 | 5.70 |
3152 rows × 7 columns
Write a Geographic Scatter Function for Yearly Temperature Increases
Linear regression is to predict a dependent variable value y based on a given independent variable x. We can use linear regression to find the linear relationship between year and temperature.
from sklearn.linear_model import LinearRegression
def coef(data_group):
"""
This function is to find the slope of the line and that
corresponds to "average change in temp per year".
"""
X = data_group[["Year"]]
y = data_group["Temp"]
LR = LinearRegression()
LR.fit(X, y)
slope = LR.coef_[0]
return slope
Define a function temperature_coefficient_plot( ) to generate a interactive geographic scatterplot. This plot will shows an estimate of the yearly change in temperature during specified month and time period at the station.
from plotly import express as px
import numpy as np
def temperature_coefficient_plot(country, year_begin, year_end, month, min_obs, **kwargs):
df=query_climate_database(country, year_begin, year_end, month)
df = df[df.groupby(["NAME"])["Year"].transform(len) >= min_obs]
df_1 = df.groupby(["NAME"])["LATITUDE"].aggregate(np.mean)
df_2 = df.groupby(["NAME"])["LONGITUDE"].aggregate(np.mean)
coefs = df.groupby(["NAME"]).apply(coef)
df_3 = pd.DataFrame(coefs)
df_4 = pd.concat([df_1,df_2,df_3], axis = 1)
df_4.rename(columns = {0: "Estimated Yearly Increase (C)"}, inplace = True)
df_4["Estimated Yearly Increase (C)"] = round(df_4["Estimated Yearly Increase (C)"],4)
month_dict = {1:"January", 2:"February", 3:"March", 4:"April", 5:" May", 6:"June", 7:"July", 8:"August", 9:"September", 10:"October", 11:"November", 12:"December"}
fig = px.scatter_mapbox(df_4, # data for the points you want to plot
lat = "LATITUDE", # column name for latitude informataion
lon = "LONGITUDE", # column name for longitude information
hover_name = "Estimated Yearly Increase (C)", # what's the bold text that appears when you hover over
color = "Estimated Yearly Increase (C)",
**kwargs)
fig.update_layout(title = f"Estimates of yearly increase in temperature in {month_dict[1]} <br> for stations in {country}, years {year_begin} - {year_end}")
return fig
Call the funtion temperature_coefficient_plot( ) to get the scatterplot.
color_map = px.colors.diverging.RdGy_r # choose a colormap
fig = temperature_coefficient_plot("India", 1980, 2020, 1,
min_obs = 10,
zoom = 2,
mapbox_style="carto-positron",
color_continuous_scale=color_map
)
fig.show()
Define a funtion query_climate_database_2( ) to query the data from temps.db. It will query the data during the specific time period.
def query_climate_database_2(year_begin, year_end):
conn = sqlite3.connect("temps.db")
cmd = \
"""
SELECT S.name, S.latitude, S.longitude, C.name , T.year, T.month, T.temp
FROM temperatures T
LEFT JOIN stations S ON T.id = S.id
LEFT JOIN countries C ON SUBSTRING(S.id,1,2) = C."fips_10-4"
WHERE T.year BETWEEN ? AND ?
"""
df = pd.read_sql_query(cmd, conn, params=(year_begin, year_end))
df = df.rename(columns= {"Name": "Country"})
conn.close()
return df
To check the data that queried from temps.db.
df = query_climate_database_2(year_begin = 2015, year_end = 2020)
df
NAME | LATITUDE | LONGITUDE | Country | Year | Month | Temp | |
---|---|---|---|---|---|---|---|
0 | SHARJAH_INTER_AIRP | 25.333 | 55.517 | United Arab Emirates | 2015 | 1 | 19.16 |
1 | SHARJAH_INTER_AIRP | 25.333 | 55.517 | United Arab Emirates | 2015 | 2 | 22.21 |
2 | SHARJAH_INTER_AIRP | 25.333 | 55.517 | United Arab Emirates | 2015 | 3 | 23.52 |
3 | SHARJAH_INTER_AIRP | 25.333 | 55.517 | United Arab Emirates | 2015 | 4 | 27.90 |
4 | SHARJAH_INTER_AIRP | 25.333 | 55.517 | United Arab Emirates | 2015 | 5 | 32.91 |
... | ... | ... | ... | ... | ... | ... | ... |
847080 | CHIPINGE | -20.200 | 32.616 | Zimbabwe | 2016 | 5 | 16.92 |
847081 | CHIPINGE | -20.200 | 32.616 | Zimbabwe | 2016 | 6 | 16.81 |
847082 | CHIPINGE | -20.200 | 32.616 | Zimbabwe | 2016 | 8 | 18.28 |
847083 | CHIPINGE | -20.200 | 32.616 | Zimbabwe | 2016 | 10 | 23.34 |
847084 | CHIPINGE | -20.200 | 32.616 | Zimbabwe | 2016 | 12 | 22.87 |
847085 rows × 7 columns
Define a function temperature_std_plot( ), and clean the data in this function. I will compute the standard deviation of temperature. Finally use plotly_express function toget a geographical scatter plot.
def temperature_std_plot(year_begin, year_end, **kwargs):
df = query_climate_database_2(year_begin, year_end)
# to get the mean temperature for each year
df['mean_temp'] = df.groupby(["NAME", "Year"])["Temp"].transform(np.mean)
# to get the standard deviation for each year
df["std"] = df.groupby(["NAME", "Year"])["Temp"].transform(np.std)
df["std"] = round(df["std"],4)
# drop the row that has same NAME and Year
df_1 = df.drop_duplicates(subset=['NAME', 'Year'])
# drop the column
df_2 = df_1.drop(["Month","Temp"], axis=1)
# drop the row that has NaN
df_2 = df_2.dropna()
fig = px.scatter_geo(df_2,
lat = "LATITUDE",
lon = "LONGITUDE",
color ="Country",
hover_name ="NAME",
size = "std",
size_max = 10,
animation_frame ="Year",
projection ="natural earth")
fig.update_layout(title = f"The standard deviation of temperature in different country from {year_begin} to {year_end}.")
return fig
Call the function temperature_std_plot( ), and then get a scatter plot.
fig = temperature_std_plot(2015, 2020)
fig.show()
4.Create Two More Interesting Figures
(1)The Plot Question: How does the temperature of each city in a country monthly change?
Dedine a function f( ) query the data from the certain year in a country, and plot an interative lineplot. From this plot, we can see the temperature monthly change.
def f(country, year):
"""
This function is to get the data from the certain year in a country,
and plot the temperature for each city in a country at a certain year.
"""
conn = sqlite3.connect("temps.db")
cmd = \
"""
SELECT S.name, C.name , T.year, T.temp, T.month
FROM temperatures T
LEFT JOIN stations S ON T.id = S.id
LEFT JOIN countries C ON SUBSTRING(S.id,1,2) = C."fips_10-4"
WHERE C.name = ? AND T.year = ?
"""
df = pd.read_sql_query(cmd, conn, params=(country, year))
df = df.rename(columns= {"Name": "Country"})
conn.close()
fig = px.line(data_frame = df, # data that needs to be plotted
x = "Month", # column name for x-axis
y = "Temp", # column name for y-axis
color = "NAME", # column name for color coding
title = f"In {year},the temperature for each city in {country} "
)
return fig
fig = f(country = "Italy", year = 2020)
fig.show()
(2) The plot question: How has mean temperature changed between two countries ?
Define a function g( ) to query the data from two countries, and plot an interactive boxplot.
def g(country_1, country_2):
"""
This function is to get the data from two countries,and compute
the mean temperature of each month in each city from these two
countries.Then graph the box plot.
"""
conn = sqlite3.connect("temps.db")
cmd = \
"""
SELECT S.name,S.latitude, C.name , T.year, T.temp, T.month
FROM temperatures T
LEFT JOIN stations S ON T.id = S.id
LEFT JOIN countries C ON SUBSTRING(S.id,1,2) = C."fips_10-4"
WHERE C.name IN (?,?)
"""
df = pd.read_sql_query(cmd, conn, params=(country_1,country_2))
df = df.rename(columns= {"Name": "Country"})
conn.close()
df = df.groupby(["Country","NAME","Month"])["Temp"].aggregate(np.mean)
df = df.reset_index()
df["Temp"] = round(df["Temp"],4)
df = df.rename(columns= {"Temp": "mean_temp"})
fig = px.box(df,
x = "Month",
y = "mean_temp",
facet_row = "Country",
width = 1000,
height = 500,
title = f"The mean temperature for each month in {country_1} and {country_2}")
return fig
fig = g(country_1 = "Italy", country_2 = "Japan")
fig.show()
From boxplot, we can see that there is no outlier outside the max mean_temperature in Italy for each month. All outliers are outside min mean_temperature in Italy for each month.