How I Tracked Down a Peculiar Problem in a Dataset Using Visualizations: Behind the Scenes

This is how I created some Visualizations with altair in python.

In [69]:
import pandas as pd
import altair as alt
import altair_saver

Reading in our dataset, we will only use the first 50,000,000 rows so we don't have memory issues.

In [70]:
review_big = pd.read_csv('C:/Users/travis/Downloads/review_analysis.csv', usecols = ['review_date', 'fandom_id'], nrows = 50_000_000, engine ='c')
review_big.head(2)
Out[70]:
review_date fandom_id
0 1460773579 2161
1 1461119701 2161
In [71]:
review_big['review_date'] = pd.to_datetime(review_big['review_date'], unit = 's')
review_big['year'] = review_big['review_date'].dt.year
review_big['hour'] = review_big['review_date'].dt.hour
review_big['month'] = review_big['review_date'].dt.month
review_big['day'] = review_big['review_date'].dt.day

Below I pick a some fandoms that are all similar size and show different severities of the problem.

In [72]:
fandom_one = review_big[review_big['fandom_id'].isin([2, 5, 6, 27])].copy() 
fandom_one.fandom_id.value_counts()
Out[72]:
6     487447
2     473183
5     463940
27    107268
Name: fandom_id, dtype: int64
In [73]:
fandom_one_dist = fandom_one.groupby(['fandom_id', 'hour']).count().reset_index()
fandom_one_dist
Out[73]:
fandom_id hour review_date year month day
0 2 0 18483 18483 18483 18483
1 2 1 19527 19527 19527 19527
2 2 2 20794 20794 20794 20794
3 2 3 21282 21282 21282 21282
4 2 4 19711 19711 19711 19711
... ... ... ... ... ... ...
91 27 19 4359 4359 4359 4359
92 27 20 4823 4823 4823 4823
93 27 21 5072 5072 5072 5072
94 27 22 5157 5157 5157 5157
95 27 23 5399 5399 5399 5399

96 rows × 6 columns

In [74]:
fandom_one_vis = alt.Chart(fandom_one_dist).mark_bar().encode(
    x = alt.X('hour:O', title = "Hour (UTC)", axis= alt.Axis(labelAngle=0)),
    y = alt.Y('year:Q', title = "Review Count"),
    column = alt.Column('fandom_id:O', title = "Fandom ID")
).properties(
    title="Reviews by Hour of Day for Various Fandoms",
    width=200, 
    height = 200
)
altair_saver.save(fandom_one_vis, 'initial.png', scale_factor = 5)
fandom_one_vis
Out[74]:

Now we will move on to exploring the distribution across years.

In [75]:
hour_year_count = review_big.groupby(['hour', 'year']).count()
hour_year_count = hour_year_count.reset_index()
count = hour_year_count.groupby('year').sum()['review_date'].rename('year_sum')
hour_year_count = hour_year_count.join(count, on='year')
hour_year_count.head(3)
Out[75]:
hour year review_date fandom_id month day year_sum
0 0 1970 31 31 31 31 31
1 0 2011 10033 10033 10033 10033 3843809
2 0 2012 275515 275515 275515 275515 5290767
In [76]:
hour_year_count['percentage_annual'] = hour_year_count['review_date'] / hour_year_count['year_sum'] * 100
hour_year_count = hour_year_count[hour_year_count['year'] > 1999].copy()
def color_opposite(percentage):
    if percentage > 10:
        return "white"
    return "black"
hour_year_count = hour_year_count.set_index(['year', 'hour'])['percentage_annual'].unstack(fill_value=0).stack().reset_index(name='percentage_annual')
hour_year_count['color'] = hour_year_count['percentage_annual'].apply(color_opposite)
# hour_year_count['color'] = hour_year_count['percentage_annual'].apply(color_opposite)
hour_year_count.head(3)
Out[76]:
year hour percentage_annual color
0 2000 0 0.0 black
1 2000 1 0.0 black
2 2000 2 0.0 black
In [221]:
 
In [77]:
color = alt.Chart(hour_year_count).mark_rect().encode(
    x = alt.X('hour:O', title = "Hour (UTC)", axis= alt.Axis(labelAngle=0)),
    y = alt.Y('year:O', title = "Year"),
    color = alt.condition(alt.datum.percentage_annual == 0, alt.value('lightgrey'), alt.Color('percentage_annual', title = None, legend=None))
)

text = alt.Chart(hour_year_count).mark_text().encode(
    x = alt.X('hour:O', title = "Hour (UTC)"),
    y = alt.Y('year:O', title = "Year"),
    color= alt.Color('color', scale=None),
    # color = alt.Color('percentage_annual', scale=alt.Scale(domain = [15, 8], scheme='greys')),
    text = alt.Text('percentage_annual', format=".1f")
)

annual_chart = alt.layer(
    color, 
    text
).resolve_scale(
    color='independent'
).properties(
    width = 800,
    height = 400,
    title = "Hourly Review Distibution by Year"
)

altair_saver.save(annual_chart, 'annual.png', scale_factor = 5)
annual_chart
Out[77]:

Now we will look at the distribution by month!

In [78]:
hour_month_count = review_big[review_big['year'] == 2011].groupby(['hour', 'month']).count()
hour_month_count = hour_month_count.reset_index()
count = hour_month_count.groupby('month').sum()['review_date'].rename('month_sum')
count
hour_month_count = hour_month_count.join(count, on='month')
hour_month_count['percentage_annual'] = hour_month_count['review_date'] / hour_month_count['month_sum'] * 100
hour_month_count = hour_month_count.set_index(['month', 'hour'])['percentage_annual'].unstack(fill_value=0).stack().reset_index(name='percentage_annual')
hour_month_count['color'] = hour_month_count['percentage_annual'].apply(color_opposite)

hour_month_count.head(2)
Out[78]:
month hour percentage_annual color
0 1 0 0.0 black
1 1 1 0.0 black
In [79]:
color = alt.Chart(hour_month_count).mark_rect().encode(
    x = alt.X('hour:O', title = "Hour (UTC)", axis= alt.Axis(labelAngle=0)),
    y = alt.Y('month:O', title = "Month"),
    color = alt.condition(alt.datum.percentage_annual == 0, alt.value('lightgrey'), alt.Color('percentage_annual', title = None, legend=None))
)

text = alt.Chart(hour_month_count).mark_text().encode(
    x = alt.X('hour:O', title = "Hour (UTC)"),
    y = alt.Y('month:O', title = "Month"),
    color= alt.Color('color', scale=None),
    # color = alt.Color('percentage_annual', scale=alt.Scale(domain = [15, 8], scheme='greys')),
    text = alt.Text('percentage_annual', format=".1f")
)
monthly_chart = alt.layer(
    color, 
    text
).resolve_scale(
    color='independent'
).properties(
    width = 800,
    height = 300,
    title = "Hourly Review Distibution by Month in 2011"
)

altair_saver.save(monthly_chart, 'monthly.png', scale_factor = 5)
monthly_chart
Out[79]:

Now that I have created the three visualisation to find the problem, I will build an interactive version that ties them all together and facilitates further exploration. First I will have to prepare each the data. For each graph I will have one dataframe that includes values for each possible selection the graph could have (the month will have months for all years). Then I will use filters to hide all the months that aren't selected.

In [80]:
hour_month_count = review_big[review_big['year'] > 1999].copy().groupby(['year', 'month', 'hour' ]).count()
hour_month_count = hour_month_count.reset_index()
count = hour_month_count.groupby(['year', 'month']).sum()['review_date'].rename('month_sum')
count
hour_month_count = hour_month_count.join(count, on=['year', 'month'])
hour_month_count['percentage_annual'] = hour_month_count['review_date'] / hour_month_count['month_sum'] * 100
hour_month_count = hour_month_count.set_index(['year', 'month', 'hour'])['percentage_annual'].unstack(fill_value=0).stack().reset_index(name='percentage_annual')
hour_month_count['color'] = hour_month_count['percentage_annual'].apply(color_opposite)
hour_month_count.head(2)
Out[80]:
year month hour percentage_annual color
0 2000 4 0 0.0 black
1 2000 4 1 0.0 black
In [81]:
hour_day_count = review_big[review_big['year'] > 1999].copy().groupby(['year', 'month', 'day', 'hour' ]).count()
hour_day_count = hour_day_count.reset_index()
hour_count = hour_day_count.groupby(['year', 'month', 'day', 'hour']).max()['review_date'].rename('day_count')
print(hour_count)
count = hour_day_count.groupby(['year', 'month', 'day']).sum()['review_date'].rename('day_sum')
count


hour_day_count = hour_day_count.join(count, on=['year', 'month', 'day'])
hour_day_count['percentage_annual'] = hour_day_count['review_date'] / hour_day_count['day_sum'] * 100
hour_day_count = hour_day_count.set_index(['year', 'month','day', 'hour'])['percentage_annual'].unstack(fill_value=0).stack().reset_index(name='percentage_annual')
hour_day_count['color'] = hour_day_count['percentage_annual'].apply(color_opposite)
# hour_day_count = hour_day_count.drop(columns = ['review_date', 'fandom_id', 'day_sum'])
hour_day_count.head(2)
year  month  day  hour
2000  4      11   7       30011
             12   7         278
             13   7         175
             14   7         270
             15   7         394
                          ...  
2017  2      11   3           4
                  4           1
                  5           1
                  7           3
                  16          1
Name: day_count, Length: 48978, dtype: int64
Out[81]:
year month day hour percentage_annual color
0 2000 4 11 0 0.0 black
1 2000 4 11 1 0.0 black
In [82]:
hour_day_count = hour_day_count.join(hour_count, on=['year', 'month', 'day', 'hour'])
hour_day_count.head(2)
Out[82]:
year month day hour percentage_annual color day_count
0 2000 4 11 0 0.0 black NaN
1 2000 4 11 1 0.0 black NaN
In [83]:
hour_day_count
Out[83]:
year month day hour percentage_annual color day_count
0 2000 4 11 0 0.0 black NaN
1 2000 4 11 1 0.0 black NaN
2 2000 4 11 2 0.0 black NaN
3 2000 4 11 3 0.0 black NaN
4 2000 4 11 4 0.0 black NaN
... ... ... ... ... ... ... ...
146659 2017 2 11 19 0.0 black NaN
146660 2017 2 11 20 0.0 black NaN
146661 2017 2 11 21 0.0 black NaN
146662 2017 2 11 22 0.0 black NaN
146663 2017 2 11 23 0.0 black NaN

146664 rows × 7 columns

Below I put all the multipart graph pieces together.

The graph is displayed at https://travisneils.github.io/dates/date_chart.html

In [84]:
single_year = alt.selection_single(on='click', fields=['year'], init={"year":2011})
single_month = alt.selection_single(on='click', fields=['month'], init={"month":12})
single_day = alt.selection_single(on='click', fields=['day'], init={"day":6})

alt.data_transformers.disable_max_rows()
color = alt.Chart(hour_year_count).mark_rect().encode(
    x = alt.X('hour:O', title = "Hour (UTC)", axis= alt.Axis(labelAngle=0)),
    y = alt.Y('year:O', title = "Year"),
    color = alt.condition(alt.datum.percentage_annual == 0, alt.value('lightgrey'), alt.Color('percentage_annual', title = None, legend=None))
)

text = alt.Chart(hour_year_count).mark_text(size = 10).encode(
    x = alt.X('hour:O', title = "Hour (UTC)"),
    y = alt.Y('year:O', title = "Year"),
    color= alt.Color('color', scale=None),
    # color = alt.Color('percentage_annual', scale=alt.Scale(domain = [15, 8], scheme='greys')),
    text = alt.Text('percentage_annual', format=".1f")
)

annual_chart = alt.layer(
    color, 
    text
).resolve_scale(
    color='independent'
).properties(
    width = 800,
    height = 300,
    title = "Hourly Review Distibution by Year"
).add_selection(
    single_year
)

color = alt.Chart(hour_month_count).mark_rect().encode(
    x = alt.X('hour:O', title = "Hour (UTC)", axis= alt.Axis(labelAngle=0), scale=alt.Scale(domain=list(range(0, 24)))),
    y = alt.Y('month:O', title = "Month"),
    color = alt.condition(alt.datum.percentage_annual == 0, alt.value('lightgrey'), alt.Color('percentage_annual', title = None, legend=None))
).transform_filter(
    single_year
)

text = alt.Chart(hour_month_count).mark_text(size = 10).encode(
    x = alt.X('hour:O', title = "Hour (UTC)", scale=alt.Scale(domain=list(range(0, 24)))),
    y = alt.Y('month:O', title = "Month"),
    color= alt.Color('color', scale=None),
    # color = alt.Color('percentage_annual', scale=alt.Scale(domain = [15, 8], scheme='greys')),
    text = alt.Text('percentage_annual', format=".1f")
).transform_filter(
    single_year
)

monthly_chart = alt.layer(
    color, 
    text
).resolve_scale(
    color='independent'
).properties(
    width = 800,
    height = 300,
    title = "Hourly Review Distibution by Month"
).add_selection(
    single_month
)

color = alt.Chart(hour_day_count).mark_rect().encode(
    x = alt.X('hour:O', title = "Hour (UTC)", axis= alt.Axis(labelAngle=0), scale=alt.Scale(domain=list(range(0, 24)))),
    y = alt.Y('day:O', title = "Day"),
    color = alt.condition(alt.datum.percentage_annual == 0, alt.value('lightgrey'), alt.Color('percentage_annual', title = None, legend=None))
).transform_filter(
    single_year & single_month
)

text = alt.Chart(hour_day_count).mark_text(size = 10).encode(
    x = alt.X('hour:O', title = "Hour (UTC)", scale=alt.Scale(domain=list(range(0, 24)))),
    y = alt.Y('day:O', title = "Day"),
    color= alt.Color('color', scale=None),
    # color = alt.Color('percentage_annual', scale=alt.Scale(domain = [15, 8], scheme='greys')),
    text = alt.Text('percentage_annual', format=".1f")
).transform_filter(
    single_year & single_month
)

daily_chart = alt.layer(
    color, 
    text
).resolve_scale(
    color='independent'
).properties(
    width = 800,
    height = 500,
    title = "Hourly Review Distibution by Day "
).add_selection(
    single_day
)

daily_hist = alt.Chart(hour_day_count).mark_bar().encode(
    x = alt.X('hour:O', title = "Hour (UTC)", scale=alt.Scale(domain=list(range(0, 24)))),
    y = alt.Y('day_count:Q', title= "Reviews Recieved")
).transform_filter(
    single_year & single_month & single_day
)

selected_year = alt.Chart(hour_month_count).mark_text().encode(
    text = alt.Text('min(year)')
).transform_filter(
    single_year
).properties(
    title="Selected Year:"
)

selected_month = alt.Chart(hour_day_count).mark_text().encode(
    text = alt.Text('min(month)')
).transform_filter(
    single_year & single_month
).properties(
    title="Selected Month:"
)

selected_day = alt.Chart(hour_day_count).mark_text().encode(
    text = alt.Text('min(day)')
).transform_filter(
    single_year & single_month & single_day
).properties(
    title="Selected Day:"
)

# altair_saver.save(monthly_chart, 'monthly.svg')

# altair_saver.save(annual_chart, 'annual.png')
interactive_chart = alt.vconcat(annual_chart,  selected_year,  monthly_chart,  selected_month, 
daily_chart, selected_day, daily_hist, center=True)
interactive_chart.save('date_chart.html')
# interactive_chart