Homework 1

In this blog, I will use the NOAA climate data to create several in interactive data graphics.

1.Create a Database

Import data

import pandas as pd
import sqlite3

To determine whether the object has the right type of data

temps = pd.read_csv("temps_stacked.csv")
temps.head()

	ID	Year	Month	Temp
0	ACW00011604	1961	1	-0.89
1	ACW00011604	1961	2	2.36
2	ACW00011604	1961	3	4.72
3	ACW00011604	1961	4	7.73
4	ACW00011604	1961	5	11.28

stations = pd.read_csv('station-metadata.csv')
stations.head()

	ID	LATITUDE	LONGITUDE	STNELEV	NAME
0	ACW00011604	57.7667	11.8667	18.0	SAVE
1	AE000041196	25.3330	55.5170	34.0	SHARJAH_INTER_AIRP
2	AEM00041184	25.6170	55.9330	31.0	RAS_AL_KHAIMAH_INTE
3	AEM00041194	25.2550	55.3640	10.4	DUBAI_INTL
4	AEM00041216	24.4300	54.4700	3.0	ABU_DHABI_BATEEN_AIR

countries = pd.read_csv('countries.csv')
countries.head()

	FIPS 10-4	ISO 3166	Name
0	AF	AF	Afghanistan
1	AX	-	Akrotiri
2	AL	AL	Albania
3	AG	DZ	Algeria
4	AQ	AS	American Samoa

Whitespaces in column names are bad for SQL.Thus ,we need to rename all columns in a table removing whitespace.

# whitespaces in column names are bad for SQL
countries = countries.rename(columns= {"FIPS 10-4": "FIPS_10-4"})
countries = countries.rename(columns= {"ISO 3166": "ISO_3166"})
countries.head()

	FIPS_10-4	ISO_3166	Name
0	AF	AF	Afghanistan
1	AX	-	Akrotiri
2	AL	AL	Albania
3	AG	DZ	Algeria
4	AQ	AS	American Samoa

Write a Query Function

Creating a Connection object that represents the database, and store the data in the temps.db file.

# open a connection to temps.db 
conn = sqlite3.connect("temps.db")
temps.to_sql("temperatures", conn, if_exists="replace", index=False)
countries.to_sql("countries", conn, if_exists="replace", index=False)
stations.to_sql("stations", conn, if_exists="replace", index=False)
# close your connection
conn.close()

Define a function query_climate_database( ) to query the data from the temps.db.

def query_climate_database(country, year_begin, year_end, month):
    conn = sqlite3.connect("temps.db")
    cmd = \
    """
    SELECT S.name, S.latitude, S.longitude, C.name , T.year, T.month, T.temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON SUBSTRING(S.id,1,2) = C."fips_10-4"
    WHERE C.name = ? AND (T.year BETWEEN ? AND ?) AND T.month = ? 
    """
    df = pd.read_sql_query(cmd, conn, params=(country, year_begin, year_end, month))
    df = df.rename(columns= {"Name": "Country"})
    conn.close()
    return df
    

query_climate_database(country = "India", 
                       year_begin = 1980, 
                       year_end = 2020,
                       month = 1)

	NAME	LATITUDE	LONGITUDE	Country	Year	Month	Temp
0	PBO_ANANTAPUR	14.583	77.633	India	1980	1	23.48
1	PBO_ANANTAPUR	14.583	77.633	India	1981	1	24.57
2	PBO_ANANTAPUR	14.583	77.633	India	1982	1	24.19
3	PBO_ANANTAPUR	14.583	77.633	India	1983	1	23.51
4	PBO_ANANTAPUR	14.583	77.633	India	1984	1	24.81
...	...	...	...	...	...	...	...
3147	DARJEELING	27.050	88.270	India	1983	1	5.10
3148	DARJEELING	27.050	88.270	India	1986	1	6.90
3149	DARJEELING	27.050	88.270	India	1994	1	8.10
3150	DARJEELING	27.050	88.270	India	1995	1	5.60
3151	DARJEELING	27.050	88.270	India	1997	1	5.70

3152 rows × 7 columns

Write a Geographic Scatter Function for Yearly Temperature Increases

Linear regression is to predict a dependent variable value y based on a given independent variable x. We can use linear regression to find the linear relationship between year and temperature.

from sklearn.linear_model import LinearRegression
def coef(data_group):
    """
    This function is to find the slope of the line and that 
    corresponds to "average change in temp per year".
    """
    X = data_group[["Year"]]
    y = data_group["Temp"]
    LR = LinearRegression()
    LR.fit(X, y)
    slope = LR.coef_[0]
    return slope

Define a function temperature_coefficient_plot( ) to generate a interactive geographic scatterplot. This plot will shows an estimate of the yearly change in temperature during specified month and time period at the station.

from plotly import express as px
import numpy as np
def temperature_coefficient_plot(country, year_begin, year_end, month, min_obs, **kwargs):
    df=query_climate_database(country, year_begin, year_end, month)
    df = df[df.groupby(["NAME"])["Year"].transform(len) >= min_obs]
    df_1 = df.groupby(["NAME"])["LATITUDE"].aggregate(np.mean)
    df_2 = df.groupby(["NAME"])["LONGITUDE"].aggregate(np.mean)
    coefs = df.groupby(["NAME"]).apply(coef)
    df_3 = pd.DataFrame(coefs)
    df_4 = pd.concat([df_1,df_2,df_3], axis = 1)
    df_4.rename(columns = {0: "Estimated Yearly Increase (C)"}, inplace = True)
    df_4["Estimated Yearly Increase (C)"] = round(df_4["Estimated Yearly Increase (C)"],4)
    month_dict = {1:"January", 2:"February", 3:"March", 4:"April", 5:" May", 6:"June", 7:"July", 8:"August", 9:"September", 10:"October", 11:"November", 12:"December"}
    fig = px.scatter_mapbox(df_4, # data for the points you want to plot
                        lat = "LATITUDE", # column name for latitude informataion
                        lon = "LONGITUDE", # column name for longitude information
                        hover_name = "Estimated Yearly Increase (C)", # what's the bold text that appears when you hover over
                        color = "Estimated Yearly Increase (C)",
                        **kwargs)
    fig.update_layout(title = f"Estimates of yearly increase in temperature in {month_dict[1]} <br> for stations in {country}, years {year_begin} - {year_end}")
    return fig

Call the funtion temperature_coefficient_plot( ) to get the scatterplot.

color_map = px.colors.diverging.RdGy_r # choose a colormap
fig = temperature_coefficient_plot("India", 1980, 2020, 1, 
                                   min_obs = 10,
                                   zoom = 2,
                                   mapbox_style="carto-positron",
                                   color_continuous_scale=color_map
                                   )
fig.show()

Define a funtion query_climate_database_2( ) to query the data from temps.db. It will query the data during the specific time period.

def query_climate_database_2(year_begin, year_end):
    conn = sqlite3.connect("temps.db")
    cmd = \
    """
    SELECT S.name, S.latitude, S.longitude, C.name , T.year, T.month, T.temp
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON SUBSTRING(S.id,1,2) = C."fips_10-4"
    WHERE T.year BETWEEN ? AND ?
    """
    df = pd.read_sql_query(cmd, conn, params=(year_begin, year_end))
    df = df.rename(columns= {"Name": "Country"})
    conn.close()
    return df

To check the data that queried from temps.db.

df = query_climate_database_2(year_begin = 2015, year_end = 2020)
df

	NAME	LATITUDE	LONGITUDE	Country	Year	Month	Temp
0	SHARJAH_INTER_AIRP	25.333	55.517	United Arab Emirates	2015	1	19.16
1	SHARJAH_INTER_AIRP	25.333	55.517	United Arab Emirates	2015	2	22.21
2	SHARJAH_INTER_AIRP	25.333	55.517	United Arab Emirates	2015	3	23.52
3	SHARJAH_INTER_AIRP	25.333	55.517	United Arab Emirates	2015	4	27.90
4	SHARJAH_INTER_AIRP	25.333	55.517	United Arab Emirates	2015	5	32.91
...	...	...	...	...	...	...	...
847080	CHIPINGE	-20.200	32.616	Zimbabwe	2016	5	16.92
847081	CHIPINGE	-20.200	32.616	Zimbabwe	2016	6	16.81
847082	CHIPINGE	-20.200	32.616	Zimbabwe	2016	8	18.28
847083	CHIPINGE	-20.200	32.616	Zimbabwe	2016	10	23.34
847084	CHIPINGE	-20.200	32.616	Zimbabwe	2016	12	22.87

847085 rows × 7 columns

Define a function temperature_std_plot( ), and clean the data in this function. I will compute the standard deviation of temperature. Finally use plotly_express function toget a geographical scatter plot.

def temperature_std_plot(year_begin, year_end,  **kwargs):
    df = query_climate_database_2(year_begin, year_end)
    # to get the mean temperature for each year
    df['mean_temp'] = df.groupby(["NAME", "Year"])["Temp"].transform(np.mean)
    # to get the standard deviation for each year
    df["std"] = df.groupby(["NAME", "Year"])["Temp"].transform(np.std)
    df["std"] = round(df["std"],4)
    # drop the row that has same NAME and Year
    df_1 = df.drop_duplicates(subset=['NAME', 'Year'])
    # drop the column
    df_2 = df_1.drop(["Month","Temp"], axis=1)
    # drop the row that has NaN
    df_2 = df_2.dropna()
    fig = px.scatter_geo(df_2,
                     lat = "LATITUDE",
                     lon = "LONGITUDE",
                     color ="Country",
                     hover_name ="NAME",
                     size = "std",
                     size_max = 10,
                     animation_frame ="Year",
                     projection ="natural earth")
    fig.update_layout(title = f"The standard deviation of temperature in different country from {year_begin} to {year_end}.")
    return fig

Call the function temperature_std_plot( ), and then get a scatter plot.

fig = temperature_std_plot(2015, 2020)
fig.show()

4.Create Two More Interesting Figures

(1)The Plot Question: How does the temperature of each city in a country monthly change?

Dedine a function f( ) query the data from the certain year in a country, and plot an interative lineplot. From this plot, we can see the temperature monthly change.

def f(country, year):
    """
    This function is to get the data from the certain year in a country,
    and plot the temperature for each city in a country at a certain year.
    """
    conn = sqlite3.connect("temps.db")
    cmd = \
    """
    SELECT S.name, C.name , T.year,  T.temp, T.month
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON SUBSTRING(S.id,1,2) = C."fips_10-4"
    WHERE C.name = ? AND T.year = ? 
    """
    df = pd.read_sql_query(cmd, conn, params=(country, year))
    df = df.rename(columns= {"Name": "Country"})
    conn.close()
    fig = px.line(data_frame = df, # data that needs to be plotted
                 x = "Month", # column name for x-axis
                 y = "Temp", # column name for y-axis
                 color = "NAME", # column name for color coding
                 title = f"In {year},the temperature for each city in {country} "
                )
    return fig
fig = f(country = "Italy", year = 2020)
fig.show()

(2) The plot question: How has mean temperature changed between two countries ?

Define a function g( ) to query the data from two countries, and plot an interactive boxplot.

def g(country_1, country_2):
    """
    This function is to get the data from two countries,and compute 
    the mean temperature of each month in each city from these two 
    countries.Then graph the box plot.
    """
    conn = sqlite3.connect("temps.db")
    cmd = \
    """
    SELECT S.name,S.latitude, C.name , T.year,  T.temp, T.month
    FROM temperatures T
    LEFT JOIN stations S ON T.id = S.id
    LEFT JOIN countries C ON SUBSTRING(S.id,1,2) = C."fips_10-4"
    WHERE C.name IN (?,?)
    """
    df = pd.read_sql_query(cmd, conn, params=(country_1,country_2))
    df = df.rename(columns= {"Name": "Country"})
    conn.close()
    df = df.groupby(["Country","NAME","Month"])["Temp"].aggregate(np.mean)
    df = df.reset_index()
    df["Temp"] = round(df["Temp"],4)
    df = df.rename(columns= {"Temp": "mean_temp"})
    fig = px.box(df, 
             x = "Month", 
             y = "mean_temp",
             facet_row = "Country",
             width = 1000,
             height = 500,
             title = f"The mean temperature for each month in {country_1} and {country_2}")
    return fig
fig = g(country_1 = "Italy", country_2 = "Japan")
fig.show()

From boxplot, we can see that there is no outlier outside the max mean_temperature in Italy for each month. All outliers are outside min mean_temperature in Italy for each month.

Written on April 1, 2022