library(tidyverse)
library(plotly)
library(RColorBrewer)
library(lme4)
library(plm)
library(magrittr)
library(knitr)
setwd("~/Github/Systembolaget")Systembolaget mining
Legal Disclaimer
The robots.txt did not forbid the techniques used for gathering the data.
Load Libraries
Load Data
# load alcoholic datasets
df_ol <- read.csv("data/ol.csv")
df_cider <- read.csv("data/cider.csv")
df_sprit <- read.csv("data/sprit.csv")
df_vin <- read.csv("data/vin.csv")
# bind all in one dataset
df <- rbind(df_ol, df_cider, df_sprit, df_vin)
# remove unused objects
rm(df_ol, df_cider, df_sprit, df_vin)
# clean/prepare dataset
df <- df %>% 
  mutate(price_p_volume = price / volume_ml,
         alc_per_price = alcohol_percent / price_p_volume,
         category_1 = as.factor(category_1))head(df) %>% 
  kable()| category_group | product_id | product_name | product_number | category_1 | category_2 | category_3 | country | producer | alcohol_percent | volume_ml | price | comparison_price | taste | package | launch_date | is_out_of_stock | price_p_volume | alc_per_price | 
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ol | 1000155 | Midas Golden Pilsner | 8936603 | Öl | Ljus lager | Pilsner - tjeckisk stil | Sverige | Imperiebryggeriet | 4.9 | 330 | 32.8 | 99.39 | Maltig smak med inslag av ljust bröd, apelsinskal, örter och aprikos. | Flaska | 2020-09-01T00:00:00 | FALSE | 0.0993939 | 49.29878 | 
| ol | 1000432 | Schlappeseppel Specialität | 8983803 | Öl | Mellanmörk & Mörk lager | Märzen och wienerstil | Tyskland | Schlappeseppel GmbH | 5.6 | 330 | 35.9 | 108.79 | NA | Flaska | 2015-09-01T00:00:00 | FALSE | 0.1087879 | 51.47632 | 
| ol | 1000438 | Schlappeseppel | 8998601 | Öl | Veteöl | Hefeweizen | Tyskland | Schlappeseppel GmbH | 5.5 | 500 | 35.1 | 70.20 | NA | Flaska | 2015-09-01T00:00:00 | FALSE | 0.0702000 | 78.34758 | 
| ol | 1000441 | Schlappeseppel | 8993603 | Öl | Ljus lager | Pilsner - tysk stil | Tyskland | Schlappeseppel GmbH | 5.1 | 330 | 26.1 | 79.09 | NA | Flaska | 2015-09-01T00:00:00 | FALSE | 0.0790909 | 64.48276 | 
| ol | 1000444 | Schlappeseppel | 8962103 | Öl | Ljus lager | Zwickel, keller- och landbier | Tyskland | Schlappeseppel GmbH | 5.5 | 330 | 27.1 | 82.12 | Maltig smak med inslag av ljust bröd, timjan och apelsinskal. | Flaska | 2015-09-01T00:00:00 | FALSE | 0.0821212 | 66.97417 | 
| ol | 1001367 | This Ale Is India Pale | 3064703 | Öl | Ale | India pale ale (IPA) | Sverige | Skelderwikens Brygghus | 5.8 | 330 | 29.4 | 89.09 | Fruktig, humlearomatisk smak med tydlig beska inslag av grapefruktskal, halm, tallbarr, aprikos och tobak. | Flaska | 2015-10-01T00:00:00 | FALSE | 0.0890909 | 65.10204 | 
Products
Barplot with best Alcohol / Price Ratio
df %>% 
  mutate(price_p_volume = price / volume_ml) %>% 
  mutate(alc_per_price = alcohol_percent / price_p_volume) %>% 
  arrange(desc(alc_per_price)) %>% 
  head(10) %>%
  ggplot(aes(x = reorder(product_name, alc_per_price), y = alc_per_price, fill = category_1)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0("Alcohol: ", round(alcohol_percent, 1), "%\nPrice: ", round(price, 2))), 
            position = position_stack(vjust = 0.5), size = 3) +
  coord_flip() +
  labs(title = "Top 10 Systembolaget Products by Alcohol Content per Price",
       x = "Product",
       y = "Alcohol % / Price per Volume",
       fill = "Category") +
  theme_minimal() +
  theme(legend.position = "right")Code
Best Beer
df %>% 
  filter(category_1 == "Öl") %>% 
  arrange(price_p_volume) %>% 
  head(5) %>%
  ggplot(aes(x = reorder(product_id, price_p_volume), y = price_p_volume)) +
  geom_bar(stat = "identity", fill = "#1f78b4") +
  geom_text(aes(label = paste0(product_name, "\nVolume: ", volume_ml, "ml\nPrice: ", round(price, 2))), 
            position = position_stack(vjust = 0.5), size = 3.5) +
  coord_flip() +
  labs(title = "Top 5 Beers Price per Volume",
       x = "Product",
       y = "Price per Volume",
       fill = "Category") +
  theme_classic() +
  theme(legend.position = "right")Deeper Price - Alcohol Analysis
Distribution of Alcohol/Price
df %>% 
  ggplot(aes(x = log(price))) +
  geom_histogram(binwidth = 0.2, color = "white", fill = "#1f78b4", alpha = 0.8) +
  labs(title = "Distribution of Product Prices",
       x = "Log(Price)",
       y = "Frequency") +
  theme_minimal()Code
df %>% 
  ggplot(aes(x = log(price), fill = category_1)) +
  geom_density(alpha = 0.6) +
  labs(title = "Distribution of Product Prices by Category",
       x = "Log(Log(Price))",
       y = "Density",
       fill = "Product Category") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2") + 
  theme(legend.position = "bottom",
        legend.direction = "horizontal")Code
OLS with Interaction
Call:
lm(formula = log(price) ~ 1 + alcohol_percent + category_1 + 
    (alcohol_percent * category_1), data = df)
Residuals:
    Min      1Q  Median      3Q     Max 
-2.3338 -0.4547 -0.1229  0.3033  6.8641 
Coefficients:
                                Estimate Std. Error t value Pr(>|t|)    
(Intercept)                      2.24607    0.13775  16.306  < 2e-16 ***
alcohol_percent                  0.27538    0.02516  10.945  < 2e-16 ***
category_1Öl                     0.49831    0.14395   3.462 0.000537 ***
category_1Sprit                  2.16709    0.14590  14.853  < 2e-16 ***
category_1Vin                    2.53985    0.14607  17.387  < 2e-16 ***
alcohol_percent:category_1Öl    -0.12942    0.02596  -4.985 6.23e-07 ***
alcohol_percent:category_1Sprit -0.22834    0.02518  -9.067  < 2e-16 ***
alcohol_percent:category_1Vin   -0.20726    0.02542  -8.153 3.71e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.7942 on 24903 degrees of freedom
Multiple R-squared:  0.577, Adjusted R-squared:  0.5769 
F-statistic:  4852 on 7 and 24903 DF,  p-value: < 2.2e-16
lm(data=df,
   log(price) ~ 1 + alcohol_percent + category_1 + (alcohol_percent*category_1)) %>% 
  summary()Code
df %>%
  mutate(predicted = predict(lm(log(price) ~ 1 + alcohol_percent + category_1 + alcohol_percent:category_1, data = df))) %>% 
  ggplot(aes(x = alcohol_percent, y = predicted, color = category_1)) +
  geom_point(aes(y = log(price)), size = 2, alpha = 0.08) +
  geom_line(size = 2, alpha = 1) +
  labs(
    title = "Interaction Effect of Alcohol Percent and Category on Log(Price)",
    x = "Alcohol Percent",
    y = "Predicted Log(Price)",
    color = "Category"
  ) +
  theme_minimal()Code
Fixed Effect Model
Call:
lm(formula = log(price) ~ 1 + alcohol_percent, data = df)
Residuals:
    Min      1Q  Median      3Q     Max 
-2.6598 -0.6514 -0.0166  0.5375  6.5906 
Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     4.5752080  0.0108671   421.0   <2e-16 ***
alcohol_percent 0.0496754  0.0004863   102.2   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.025 on 24909 degrees of freedom
Multiple R-squared:  0.2952,    Adjusted R-squared:  0.2952 
F-statistic: 1.043e+04 on 1 and 24909 DF,  p-value: < 2.2e-16
lm(formula = log(price) ~ 1 + alcohol_percent,
   data = df) %>% 
  summary()Code
Oneway (individual) effect Within Model
Call:
plm(formula = log(price) ~ 1 + alcohol_percent, data = df, model = "within", 
    index = "category_1")
Unbalanced Panel: n = 4, T = 412-15189, N = 24911
Residuals:
    Min.  1st Qu.   Median  3rd Qu.     Max. 
-2.44806 -0.46050 -0.13303  0.31490  6.86099 
Coefficients:
                 Estimate Std. Error t-value  Pr(>|t|)    
alcohol_percent 0.0519468  0.0010669  48.688 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Total Sum of Squares:    17433
Residual Sum of Squares: 15918
R-Squared:      0.086906
Adj. R-Squared: 0.086759
F-statistic: 2370.49 on 1 and 24906 DF, p-value: < 2.22e-16
plm(formula = log(price) ~ 1 + alcohol_percent,
    index = "category_1",
    model = "within",
    data = df) %>% 
summary()Code
df %>%
  group_by(category_1) %>%
  mutate(mean_alcohol_percent = mean(alcohol_percent)) %>%
  mutate(mean_log_price = mean(log(price))) %>% 
  ungroup() %>% 
  mutate(normalized_alcohol_percent = alcohol_percent - mean_alcohol_percent) %>% 
  mutate(normalized_log_price = log(price) - mean_log_price) %>%
  ggplot(aes(x = normalized_alcohol_percent, y = log(price), color = category_1)) +
  geom_point(alpha=0.2) +  # Standard scatter plot
  labs(title = "Scatterplot of normalized log(price) vs normalized alcohol_percent",
       x = "Normalized Alcohol Percent",
       y = "Normalized Log(Price)") +
  theme_minimal()