I use data science and computational social science to learn about the world, answer questions, and solve problems. I am interested in exploring how computational social science and data science can expand and enrich decision-making and improve public policies. I work in the international development field as a research analyst. I am also a graduate student at the University of Washington, specializing in data science.
Before moving to Seattle, I lived in Central America, where I spent the majority of my time volunteering with NGOs on work ranging from community development projects to agroforestry operations. I ultimately ended up in Panama City and worked as the Logistics and Mapping Coordinator for an environmental policy think tank, designing and implementing a metropolitan-wide recycling program. I received my bachelor's degree in Urban Planning & Sustainable Development, with a minor in Economics, from Western Washington University in 2014.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import t
from IPython.display import display, HTML
# Widen the notebook's cell container by injecting a CSS rule into the page
notebook_css = "<style>.container { width:70% !important; }</style>"
display(HTML(notebook_css))
# Load the combined traffic data and preview the first rows
traffic = pd.read_csv('CombinedTrafficData_Imputed_Values.csv')
traffic.head()
# Parse the Date strings into real datetimes
traffic['Date'] = pd.to_datetime(traffic['Date'])
# Keep only January, February, and March as the period of observation
jfm_traffic = traffic[traffic['Date'].dt.month < 4]
# Break each date out into separate Year, Month, and Day columns
jfm_traffic = jfm_traffic.assign(
    Year=jfm_traffic['Date'].dt.year,
    Month=jfm_traffic['Date'].dt.month,
    Day=jfm_traffic['Date'].dt.day,
)
# Collapse Travel Direction: before grouping there are two rows per date
# (one per direction of travel); summing by calendar date merges them.
date_keys = ['Year', 'Month', 'Day']
grouped_traffic = jfm_traffic.groupby(date_keys, as_index=False).sum()
# Total vehicles per day = row-wise sum of the columns at positions 3..26.
# NOTE(review): presumably these are the 24 hourly count columns - confirm
# against the CSV layout, since the slice is positional.
grouped_traffic['NumVehicles'] = grouped_traffic.iloc[:, 3:27].sum(axis=1)
grouped_traffic.sample(5)
%%HTML
<div class='tableauPlaceholder' id='viz1596763938375' style='position: relative'><noscript><a href='#'><img alt=' ' src='https://public.tableau.com/static/images/5B/5BRFQMPTH/1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='path' value='shared/5BRFQMPTH' /> <param name='toolbar' value='yes' /><param name='static_image' value='https://public.tableau.com/static/images/5B/5BRFQMPTH/1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en' /></object></div> <script type='text/javascript'> var divElement = document.getElementById('viz1596763938375'); var vizElement = divElement.getElementsByTagName('object')[0]; vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px'; var scriptElement = document.createElement('script'); scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js'; vizElement.parentNode.insertBefore(scriptElement, vizElement); </script>
# Partition the daily totals into the 2020 group and the 2018-2019 group
is_2020 = grouped_traffic['Year'] == 2020
traffic_2020 = grouped_traffic.loc[is_2020]
traffic_2018_2019 = grouped_traffic.loc[~is_2020]
# Spread of daily traffic volumes in each group (population standard deviation)
traffic_2020['NumVehicles'].std(ddof=0), traffic_2018_2019['NumVehicles'].std(ddof=0)
# The two spreads differ, so the equal-variance assumption does not hold:
# use Welch's two-sample, one-sided t-test.
# Create a function that calculates a one-tailed Welch's test statistic
def tstat(group1, group2, column='NumVehicles'):
    """Compute Welch's unequal-variance t-statistic and its degrees of freedom.

    Parameters
    ----------
    group1, group2 : DataFrame-like
        Objects indexable by ``column`` that yield the observations of each
        sample. Each sample needs at least two observations for the sample
        variance to be defined.
    column : str, optional
        Name of the column holding the observations. Defaults to
        ``'NumVehicles'``.

    Returns
    -------
    tuple
        ``(welchs_t_stat, degrees_freedom)``. The statistic is signed: it is
        the mean of ``group1`` minus the mean of ``group2``, divided by the
        combined standard error.
    """
    values_g1 = group1[column]
    values_g2 = group2[column]
    num_obs_g1 = len(values_g1)
    num_obs_g2 = len(values_g2)

    # Squared standard error of each group mean: sample variance (ddof=1) / n.
    sq_err_g1 = np.var(values_g1, ddof=1) / num_obs_g1
    sq_err_g2 = np.var(values_g2, ddof=1) / num_obs_g2

    welchs_t_stat = (np.mean(values_g1) - np.mean(values_g2)) / np.sqrt(sq_err_g1 + sq_err_g2)

    # Welch-Satterthwaite approximation of the effective degrees of freedom.
    degrees_freedom = (sq_err_g1 + sq_err_g2) ** 2 / (
        sq_err_g1 ** 2 / (num_obs_g1 - 1) + sq_err_g2 ** 2 / (num_obs_g2 - 1)
    )
    return welchs_t_stat, degrees_freedom
# Run Welch's test once for all days, January - March, and reuse both outputs
jfm_t_stat, jfm_dof = tstat(traffic_2020, traffic_2018_2019)
print("T-Statistic:{}\nDegrees of Freedom: {}".format(jfm_t_stat, jfm_dof))
# One-tailed p-value: upper-tail probability of |t| under Student's t with jfm_dof
jfm_pval = stats.t.sf(np.abs(jfm_t_stat), jfm_dof)
print("P Value: {}".format('%.10f' % jfm_pval))
# Create a copy of the dataframe so the all-days frame is left untouched
jfm_traffic_weekdays = jfm_traffic.copy()
# Remove Saturdays and Sundays from the data
jfm_traffic_weekdays = jfm_traffic_weekdays.loc[(jfm_traffic_weekdays.DayOfWeek != 'Saturday') &
                                                (jfm_traffic_weekdays.DayOfWeek != 'Sunday')]
# Group the data by date and add a column with the total number of vehicles per day for each observation.
grouped_traffic_weekdays = jfm_traffic_weekdays.groupby(['Year', 'Month', 'Day'], as_index=False).sum()
grouped_traffic_weekdays['NumVehicles'] = grouped_traffic_weekdays.iloc[:, 3:27].sum(axis=1)
grouped_traffic_weekdays.sample(5)
# Split traffic into 2018-2019 grouping and 2020 grouping
traffic_2020_weekdays = grouped_traffic_weekdays.loc[grouped_traffic_weekdays['Year'] == 2020]
traffic_2018_2019_weekdays = grouped_traffic_weekdays.loc[grouped_traffic_weekdays['Year'] != 2020]
# Compare weekday 2020 against weekday 2018-2019. Both samples must be the
# weekday-only frames so the printed statistic matches the p-value below.
weekday_t_stat, weekday_dof = tstat(traffic_2020_weekdays, traffic_2018_2019_weekdays)
print("T-Statistic:{}\nDegrees of Freedom: {}".format(weekday_t_stat, weekday_dof))
# One-tailed p-value: upper-tail probability of |t| under Student's t with weekday_dof
weekday_pval = stats.t.sf(np.abs(weekday_t_stat), weekday_dof)
print("P Value: {}".format('%.10f' % weekday_pval))
# Follow similar steps as above, but exclude January and February first: keep
# only the March weekday rows, then group by date and compute the total
# number of vehicles per day for each observation.
mar_traffic_weekdays = jfm_traffic_weekdays.copy()
mar_traffic_weekdays = mar_traffic_weekdays[(pd.DatetimeIndex(mar_traffic_weekdays.Date).month == 3)]
mar_grouped_traffic_weekdays = mar_traffic_weekdays.groupby(['Year', 'Month', 'Day'], as_index=False).sum()
mar_grouped_traffic_weekdays['NumVehicles'] = mar_grouped_traffic_weekdays.iloc[:, 3:27].sum(axis=1)
mar_grouped_traffic_weekdays.sample(5)
# Split traffic into 2018-2019 grouping and 2020 grouping
mar_traffic_2020_weekdays = mar_grouped_traffic_weekdays.loc[mar_grouped_traffic_weekdays['Year'] == 2020]
mar_traffic_2018_2019_weekdays = mar_grouped_traffic_weekdays.loc[mar_grouped_traffic_weekdays['Year'] != 2020]
# Run Welch's test once for March weekdays and reuse both outputs
mar_weekday_t_stat, mar_weekday_dof = tstat(mar_traffic_2020_weekdays, mar_traffic_2018_2019_weekdays)
print("T-Statistic:{}\nDegrees of Freedom: {}".format(mar_weekday_t_stat, mar_weekday_dof))
# One-tailed p-value: upper-tail probability of |t| under Student's t with mar_weekday_dof
mar_weekday_pval = stats.t.sf(np.abs(mar_weekday_t_stat), mar_weekday_dof)
print("P Value: {}".format('%.10f' % mar_weekday_pval))
# Report the three p-values side by side, each fixed to ten decimal places
pval_summary = ("P Value for All Days, January 1 - March 31: {}\n"
                "P Value for Weekdays Only, January 1 - March 31: {}\n"
                "P Value for Weekdays Only, March 1 - March 31: {}")
print(pval_summary.format('%.10f' % jfm_pval, '%.10f' % weekday_pval, '%.10f' % mar_weekday_pval))
# Plot the p-values, multiplied by 10M so they are readable on a linear axis
scenario_labels = ['All Months\nAll Days', 'All Months\nWeekdays', 'March\nWeekdays']
rescaled_pvals = [pval * 10000000 for pval in (jfm_pval, weekday_pval, mar_weekday_pval)]
plt.figure(figsize=(8, 8))
plt.scatter(scenario_labels, rescaled_pvals)
plt.title('Rescaled P-Values')
plt.ylabel('P-Values multiplied by 10M')
All of these sites are excluded from the analysis
The remaining sites (R046, R117, and S502) all had missing data values for certain days, or for one direction (north/south, east/west) on a certain day. These missing values were imputed by calculating the mean traffic volume for that day of the week in that year/month combination. For example, if there was no data for Sunday March 11, 2018, the mean traffic volume for Sundays in March of 2018 was calculated and assumed to be the traffic volume for that day. The time of day was ignored, and each hour was given the same traffic volume, such that the hours summed together for Sunday March 11, 2018 would equal the average traffic volume for Sundays in March 2018. To ensure that values were whole numbers and estimates were conservative, the FLOOR function was used: FLOOR((Avg Num Vehicles/24),1)