This is how I created some Visualizations with altair in python.
import pandas as pd
import altair as alt
import altair_saver
Reading in our dataset, we will only use the first 50,000,000 rows so we don't have memory issues.
review_big = pd.read_csv('C:/Users/travis/Downloads/review_analysis.csv', usecols = ['review_date', 'fandom_id'], nrows = 50_000_000, engine ='c')
review_big.head(2)
review_big['review_date'] = pd.to_datetime(review_big['review_date'], unit = 's')
review_big['year'] = review_big['review_date'].dt.year
review_big['hour'] = review_big['review_date'].dt.hour
review_big['month'] = review_big['review_date'].dt.month
review_big['day'] = review_big['review_date'].dt.day
Below I pick a some fandoms that are all similar size and show different severities of the problem.
fandom_one = review_big[review_big['fandom_id'].isin([2, 5, 6, 27])].copy()
fandom_one.fandom_id.value_counts()
fandom_one_dist = fandom_one.groupby(['fandom_id', 'hour']).count().reset_index()
fandom_one_dist
fandom_one_vis = alt.Chart(fandom_one_dist).mark_bar().encode(
x = alt.X('hour:O', title = "Hour (UTC)", axis= alt.Axis(labelAngle=0)),
y = alt.Y('year:Q', title = "Review Count"),
column = alt.Column('fandom_id:O', title = "Fandom ID")
).properties(
title="Reviews by Hour of Day for Various Fandoms",
width=200,
height = 200
)
altair_saver.save(fandom_one_vis, 'initial.png', scale_factor = 5)
fandom_one_vis
Now we will move on to exploring the distribution across years.
hour_year_count = review_big.groupby(['hour', 'year']).count()
hour_year_count = hour_year_count.reset_index()
count = hour_year_count.groupby('year').sum()['review_date'].rename('year_sum')
hour_year_count = hour_year_count.join(count, on='year')
hour_year_count.head(3)
hour_year_count['percentage_annual'] = hour_year_count['review_date'] / hour_year_count['year_sum'] * 100
hour_year_count = hour_year_count[hour_year_count['year'] > 1999].copy()
def color_opposite(percentage):
if percentage > 10:
return "white"
return "black"
hour_year_count = hour_year_count.set_index(['year', 'hour'])['percentage_annual'].unstack(fill_value=0).stack().reset_index(name='percentage_annual')
hour_year_count['color'] = hour_year_count['percentage_annual'].apply(color_opposite)
# hour_year_count['color'] = hour_year_count['percentage_annual'].apply(color_opposite)
hour_year_count.head(3)
color = alt.Chart(hour_year_count).mark_rect().encode(
x = alt.X('hour:O', title = "Hour (UTC)", axis= alt.Axis(labelAngle=0)),
y = alt.Y('year:O', title = "Year"),
color = alt.condition(alt.datum.percentage_annual == 0, alt.value('lightgrey'), alt.Color('percentage_annual', title = None, legend=None))
)
text = alt.Chart(hour_year_count).mark_text().encode(
x = alt.X('hour:O', title = "Hour (UTC)"),
y = alt.Y('year:O', title = "Year"),
color= alt.Color('color', scale=None),
# color = alt.Color('percentage_annual', scale=alt.Scale(domain = [15, 8], scheme='greys')),
text = alt.Text('percentage_annual', format=".1f")
)
annual_chart = alt.layer(
color,
text
).resolve_scale(
color='independent'
).properties(
width = 800,
height = 400,
title = "Hourly Review Distibution by Year"
)
altair_saver.save(annual_chart, 'annual.png', scale_factor = 5)
annual_chart
Now we will look at the distribution by month!
hour_month_count = review_big[review_big['year'] == 2011].groupby(['hour', 'month']).count()
hour_month_count = hour_month_count.reset_index()
count = hour_month_count.groupby('month').sum()['review_date'].rename('month_sum')
count
hour_month_count = hour_month_count.join(count, on='month')
hour_month_count['percentage_annual'] = hour_month_count['review_date'] / hour_month_count['month_sum'] * 100
hour_month_count = hour_month_count.set_index(['month', 'hour'])['percentage_annual'].unstack(fill_value=0).stack().reset_index(name='percentage_annual')
hour_month_count['color'] = hour_month_count['percentage_annual'].apply(color_opposite)
hour_month_count.head(2)
color = alt.Chart(hour_month_count).mark_rect().encode(
x = alt.X('hour:O', title = "Hour (UTC)", axis= alt.Axis(labelAngle=0)),
y = alt.Y('month:O', title = "Month"),
color = alt.condition(alt.datum.percentage_annual == 0, alt.value('lightgrey'), alt.Color('percentage_annual', title = None, legend=None))
)
text = alt.Chart(hour_month_count).mark_text().encode(
x = alt.X('hour:O', title = "Hour (UTC)"),
y = alt.Y('month:O', title = "Month"),
color= alt.Color('color', scale=None),
# color = alt.Color('percentage_annual', scale=alt.Scale(domain = [15, 8], scheme='greys')),
text = alt.Text('percentage_annual', format=".1f")
)
monthly_chart = alt.layer(
color,
text
).resolve_scale(
color='independent'
).properties(
width = 800,
height = 300,
title = "Hourly Review Distibution by Month in 2011"
)
altair_saver.save(monthly_chart, 'monthly.png', scale_factor = 5)
monthly_chart
Now that I have created the three visualisation to find the problem, I will build an interactive version that ties them all together and facilitates further exploration. First I will have to prepare each the data. For each graph I will have one dataframe that includes values for each possible selection the graph could have (the month will have months for all years). Then I will use filters to hide all the months that aren't selected.
hour_month_count = review_big[review_big['year'] > 1999].copy().groupby(['year', 'month', 'hour' ]).count()
hour_month_count = hour_month_count.reset_index()
count = hour_month_count.groupby(['year', 'month']).sum()['review_date'].rename('month_sum')
count
hour_month_count = hour_month_count.join(count, on=['year', 'month'])
hour_month_count['percentage_annual'] = hour_month_count['review_date'] / hour_month_count['month_sum'] * 100
hour_month_count = hour_month_count.set_index(['year', 'month', 'hour'])['percentage_annual'].unstack(fill_value=0).stack().reset_index(name='percentage_annual')
hour_month_count['color'] = hour_month_count['percentage_annual'].apply(color_opposite)
hour_month_count.head(2)
hour_day_count = review_big[review_big['year'] > 1999].copy().groupby(['year', 'month', 'day', 'hour' ]).count()
hour_day_count = hour_day_count.reset_index()
hour_count = hour_day_count.groupby(['year', 'month', 'day', 'hour']).max()['review_date'].rename('day_count')
print(hour_count)
count = hour_day_count.groupby(['year', 'month', 'day']).sum()['review_date'].rename('day_sum')
count
hour_day_count = hour_day_count.join(count, on=['year', 'month', 'day'])
hour_day_count['percentage_annual'] = hour_day_count['review_date'] / hour_day_count['day_sum'] * 100
hour_day_count = hour_day_count.set_index(['year', 'month','day', 'hour'])['percentage_annual'].unstack(fill_value=0).stack().reset_index(name='percentage_annual')
hour_day_count['color'] = hour_day_count['percentage_annual'].apply(color_opposite)
# hour_day_count = hour_day_count.drop(columns = ['review_date', 'fandom_id', 'day_sum'])
hour_day_count.head(2)
hour_day_count = hour_day_count.join(hour_count, on=['year', 'month', 'day', 'hour'])
hour_day_count.head(2)
hour_day_count
Below I put all the multipart graph pieces together.
The graph is displayed at https://travisneils.github.io/dates/date_chart.html
single_year = alt.selection_single(on='click', fields=['year'], init={"year":2011})
single_month = alt.selection_single(on='click', fields=['month'], init={"month":12})
single_day = alt.selection_single(on='click', fields=['day'], init={"day":6})
alt.data_transformers.disable_max_rows()
color = alt.Chart(hour_year_count).mark_rect().encode(
x = alt.X('hour:O', title = "Hour (UTC)", axis= alt.Axis(labelAngle=0)),
y = alt.Y('year:O', title = "Year"),
color = alt.condition(alt.datum.percentage_annual == 0, alt.value('lightgrey'), alt.Color('percentage_annual', title = None, legend=None))
)
text = alt.Chart(hour_year_count).mark_text(size = 10).encode(
x = alt.X('hour:O', title = "Hour (UTC)"),
y = alt.Y('year:O', title = "Year"),
color= alt.Color('color', scale=None),
# color = alt.Color('percentage_annual', scale=alt.Scale(domain = [15, 8], scheme='greys')),
text = alt.Text('percentage_annual', format=".1f")
)
annual_chart = alt.layer(
color,
text
).resolve_scale(
color='independent'
).properties(
width = 800,
height = 300,
title = "Hourly Review Distibution by Year"
).add_selection(
single_year
)
color = alt.Chart(hour_month_count).mark_rect().encode(
x = alt.X('hour:O', title = "Hour (UTC)", axis= alt.Axis(labelAngle=0), scale=alt.Scale(domain=list(range(0, 24)))),
y = alt.Y('month:O', title = "Month"),
color = alt.condition(alt.datum.percentage_annual == 0, alt.value('lightgrey'), alt.Color('percentage_annual', title = None, legend=None))
).transform_filter(
single_year
)
text = alt.Chart(hour_month_count).mark_text(size = 10).encode(
x = alt.X('hour:O', title = "Hour (UTC)", scale=alt.Scale(domain=list(range(0, 24)))),
y = alt.Y('month:O', title = "Month"),
color= alt.Color('color', scale=None),
# color = alt.Color('percentage_annual', scale=alt.Scale(domain = [15, 8], scheme='greys')),
text = alt.Text('percentage_annual', format=".1f")
).transform_filter(
single_year
)
monthly_chart = alt.layer(
color,
text
).resolve_scale(
color='independent'
).properties(
width = 800,
height = 300,
title = "Hourly Review Distibution by Month"
).add_selection(
single_month
)
color = alt.Chart(hour_day_count).mark_rect().encode(
x = alt.X('hour:O', title = "Hour (UTC)", axis= alt.Axis(labelAngle=0), scale=alt.Scale(domain=list(range(0, 24)))),
y = alt.Y('day:O', title = "Day"),
color = alt.condition(alt.datum.percentage_annual == 0, alt.value('lightgrey'), alt.Color('percentage_annual', title = None, legend=None))
).transform_filter(
single_year & single_month
)
text = alt.Chart(hour_day_count).mark_text(size = 10).encode(
x = alt.X('hour:O', title = "Hour (UTC)", scale=alt.Scale(domain=list(range(0, 24)))),
y = alt.Y('day:O', title = "Day"),
color= alt.Color('color', scale=None),
# color = alt.Color('percentage_annual', scale=alt.Scale(domain = [15, 8], scheme='greys')),
text = alt.Text('percentage_annual', format=".1f")
).transform_filter(
single_year & single_month
)
daily_chart = alt.layer(
color,
text
).resolve_scale(
color='independent'
).properties(
width = 800,
height = 500,
title = "Hourly Review Distibution by Day "
).add_selection(
single_day
)
daily_hist = alt.Chart(hour_day_count).mark_bar().encode(
x = alt.X('hour:O', title = "Hour (UTC)", scale=alt.Scale(domain=list(range(0, 24)))),
y = alt.Y('day_count:Q', title= "Reviews Recieved")
).transform_filter(
single_year & single_month & single_day
)
selected_year = alt.Chart(hour_month_count).mark_text().encode(
text = alt.Text('min(year)')
).transform_filter(
single_year
).properties(
title="Selected Year:"
)
selected_month = alt.Chart(hour_day_count).mark_text().encode(
text = alt.Text('min(month)')
).transform_filter(
single_year & single_month
).properties(
title="Selected Month:"
)
selected_day = alt.Chart(hour_day_count).mark_text().encode(
text = alt.Text('min(day)')
).transform_filter(
single_year & single_month & single_day
).properties(
title="Selected Day:"
)
# altair_saver.save(monthly_chart, 'monthly.svg')
# altair_saver.save(annual_chart, 'annual.png')
interactive_chart = alt.vconcat(annual_chart, selected_year, monthly_chart, selected_month,
daily_chart, selected_day, daily_hist, center=True)
interactive_chart.save('date_chart.html')
# interactive_chart