library(tidyverse)
library(plotly)
library(RColorBrewer)
library(lme4)
library(plm)
library(magrittr)
library(knitr)
setwd("~/Github/Systembolaget")
Systembolaget mining
Legal Disclaimer
The robots.txt did not forbid the techniques used for gathering the data.
Load Libraries
Load Data
# load alcoholic datasets
<- read.csv("data/ol.csv")
df_ol <- read.csv("data/cider.csv")
df_cider <- read.csv("data/sprit.csv")
df_sprit <- read.csv("data/vin.csv")
df_vin
# bind all in one dataset
<- rbind(df_ol, df_cider, df_sprit, df_vin)
df
# remove unused objects
rm(df_ol, df_cider, df_sprit, df_vin)
# clean/prepare dataset
<- df %>%
df mutate(price_p_volume = price / volume_ml,
alc_per_price = alcohol_percent / price_p_volume,
category_1 = as.factor(category_1))
head(df) %>%
kable()
category_group | product_id | product_name | product_number | category_1 | category_2 | category_3 | country | producer | alcohol_percent | volume_ml | price | comparison_price | taste | package | launch_date | is_out_of_stock | price_p_volume | alc_per_price |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ol | 1000155 | Midas Golden Pilsner | 8936603 | Öl | Ljus lager | Pilsner - tjeckisk stil | Sverige | Imperiebryggeriet | 4.9 | 330 | 32.8 | 99.39 | Maltig smak med inslag av ljust bröd, apelsinskal, örter och aprikos. | Flaska | 2020-09-01T00:00:00 | FALSE | 0.0993939 | 49.29878 |
ol | 1000432 | Schlappeseppel Specialität | 8983803 | Öl | Mellanmörk & Mörk lager | Märzen och wienerstil | Tyskland | Schlappeseppel GmbH | 5.6 | 330 | 35.9 | 108.79 | NA | Flaska | 2015-09-01T00:00:00 | FALSE | 0.1087879 | 51.47632 |
ol | 1000438 | Schlappeseppel | 8998601 | Öl | Veteöl | Hefeweizen | Tyskland | Schlappeseppel GmbH | 5.5 | 500 | 35.1 | 70.20 | NA | Flaska | 2015-09-01T00:00:00 | FALSE | 0.0702000 | 78.34758 |
ol | 1000441 | Schlappeseppel | 8993603 | Öl | Ljus lager | Pilsner - tysk stil | Tyskland | Schlappeseppel GmbH | 5.1 | 330 | 26.1 | 79.09 | NA | Flaska | 2015-09-01T00:00:00 | FALSE | 0.0790909 | 64.48276 |
ol | 1000444 | Schlappeseppel | 8962103 | Öl | Ljus lager | Zwickel, keller- och landbier | Tyskland | Schlappeseppel GmbH | 5.5 | 330 | 27.1 | 82.12 | Maltig smak med inslag av ljust bröd, timjan och apelsinskal. | Flaska | 2015-09-01T00:00:00 | FALSE | 0.0821212 | 66.97417 |
ol | 1001367 | This Ale Is India Pale | 3064703 | Öl | Ale | India pale ale (IPA) | Sverige | Skelderwikens Brygghus | 5.8 | 330 | 29.4 | 89.09 | Fruktig, humlearomatisk smak med tydlig beska inslag av grapefruktskal, halm, tallbarr, aprikos och tobak. | Flaska | 2015-10-01T00:00:00 | FALSE | 0.0890909 | 65.10204 |
Products
Barplot with best Alcohol / Price Ratio
%>%
df mutate(price_p_volume = price / volume_ml) %>%
mutate(alc_per_price = alcohol_percent / price_p_volume) %>%
arrange(desc(alc_per_price)) %>%
head(10) %>%
ggplot(aes(x = reorder(product_name, alc_per_price), y = alc_per_price, fill = category_1)) +
geom_bar(stat = "identity") +
geom_text(aes(label = paste0("Alcohol: ", round(alcohol_percent, 1), "%\nPrice: ", round(price, 2))),
position = position_stack(vjust = 0.5), size = 3) +
coord_flip() +
labs(title = "Top 10 Systembolaget Products by Alcohol Content per Price",
x = "Product",
y = "Alcohol % / Price per Volume",
fill = "Category") +
theme_minimal() +
theme(legend.position = "right")
Code
Best Beer
%>%
df filter(category_1 == "Öl") %>%
arrange(price_p_volume) %>%
head(5) %>%
ggplot(aes(x = reorder(product_id, price_p_volume), y = price_p_volume)) +
geom_bar(stat = "identity", fill = "#1f78b4") +
geom_text(aes(label = paste0(product_name, "\nVolume: ", volume_ml, "ml\nPrice: ", round(price, 2))),
position = position_stack(vjust = 0.5), size = 3.5) +
coord_flip() +
labs(title = "Top 5 Beers Price per Volume",
x = "Product",
y = "Price per Volume",
fill = "Category") +
theme_classic() +
theme(legend.position = "right")
Deeper Price - Alcohol Analysis
Distribution of Alcohol/Price
%>%
df ggplot(aes(x = log(price))) +
geom_histogram(binwidth = 0.2, color = "white", fill = "#1f78b4", alpha = 0.8) +
labs(title = "Distribution of Product Prices",
x = "Log(Price)",
y = "Frequency") +
theme_minimal()
Code
%>%
df ggplot(aes(x = log(price), fill = category_1)) +
geom_density(alpha = 0.6) +
labs(title = "Distribution of Product Prices by Category",
x = "Log(Log(Price))",
y = "Density",
fill = "Product Category") +
theme_minimal() +
scale_fill_brewer(palette = "Set2") +
theme(legend.position = "bottom",
legend.direction = "horizontal")
Code
OLS with Interaction
Call:
lm(formula = log(price) ~ 1 + alcohol_percent + category_1 +
(alcohol_percent * category_1), data = df)
Residuals:
Min 1Q Median 3Q Max
-2.3338 -0.4547 -0.1229 0.3033 6.8641
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.24607 0.13775 16.306 < 2e-16 ***
alcohol_percent 0.27538 0.02516 10.945 < 2e-16 ***
category_1Öl 0.49831 0.14395 3.462 0.000537 ***
category_1Sprit 2.16709 0.14590 14.853 < 2e-16 ***
category_1Vin 2.53985 0.14607 17.387 < 2e-16 ***
alcohol_percent:category_1Öl -0.12942 0.02596 -4.985 6.23e-07 ***
alcohol_percent:category_1Sprit -0.22834 0.02518 -9.067 < 2e-16 ***
alcohol_percent:category_1Vin -0.20726 0.02542 -8.153 3.71e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.7942 on 24903 degrees of freedom
Multiple R-squared: 0.577, Adjusted R-squared: 0.5769
F-statistic: 4852 on 7 and 24903 DF, p-value: < 2.2e-16
lm(data=df,
log(price) ~ 1 + alcohol_percent + category_1 + (alcohol_percent*category_1)) %>%
summary()
Code
%>%
df mutate(predicted = predict(lm(log(price) ~ 1 + alcohol_percent + category_1 + alcohol_percent:category_1, data = df))) %>%
ggplot(aes(x = alcohol_percent, y = predicted, color = category_1)) +
geom_point(aes(y = log(price)), size = 2, alpha = 0.08) +
geom_line(size = 2, alpha = 1) +
labs(
title = "Interaction Effect of Alcohol Percent and Category on Log(Price)",
x = "Alcohol Percent",
y = "Predicted Log(Price)",
color = "Category"
+
) theme_minimal()
Code
Fixed Effect Model
Call:
lm(formula = log(price) ~ 1 + alcohol_percent, data = df)
Residuals:
Min 1Q Median 3Q Max
-2.6598 -0.6514 -0.0166 0.5375 6.5906
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.5752080 0.0108671 421.0 <2e-16 ***
alcohol_percent 0.0496754 0.0004863 102.2 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.025 on 24909 degrees of freedom
Multiple R-squared: 0.2952, Adjusted R-squared: 0.2952
F-statistic: 1.043e+04 on 1 and 24909 DF, p-value: < 2.2e-16
lm(formula = log(price) ~ 1 + alcohol_percent,
data = df) %>%
summary()
Code
Oneway (individual) effect Within Model
Call:
plm(formula = log(price) ~ 1 + alcohol_percent, data = df, model = "within",
index = "category_1")
Unbalanced Panel: n = 4, T = 412-15189, N = 24911
Residuals:
Min. 1st Qu. Median 3rd Qu. Max.
-2.44806 -0.46050 -0.13303 0.31490 6.86099
Coefficients:
Estimate Std. Error t-value Pr(>|t|)
alcohol_percent 0.0519468 0.0010669 48.688 < 2.2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Total Sum of Squares: 17433
Residual Sum of Squares: 15918
R-Squared: 0.086906
Adj. R-Squared: 0.086759
F-statistic: 2370.49 on 1 and 24906 DF, p-value: < 2.22e-16
plm(formula = log(price) ~ 1 + alcohol_percent,
index = "category_1",
model = "within",
data = df) %>%
summary()
Code
%>%
df group_by(category_1) %>%
mutate(mean_alcohol_percent = mean(alcohol_percent)) %>%
mutate(mean_log_price = mean(log(price))) %>%
ungroup() %>%
mutate(normalized_alcohol_percent = alcohol_percent - mean_alcohol_percent) %>%
mutate(normalized_log_price = log(price) - mean_log_price) %>%
ggplot(aes(x = normalized_alcohol_percent, y = log(price), color = category_1)) +
geom_point(alpha=0.2) + # Standard scatter plot
labs(title = "Scatterplot of normalized log(price) vs normalized alcohol_percent",
x = "Normalized Alcohol Percent",
y = "Normalized Log(Price)") +
theme_minimal()