Systembolaget mining

Author

Thomas Haase

Published

February 27, 2025

The robots.txt did not forbid the techniques used for gathering the data.

Load Libraries

library(tidyverse)
library(plotly)
library(RColorBrewer)
library(lme4)
library(plm)
library(magrittr)
library(knitr)


setwd("~/Github/Systembolaget")

Load Data

# load alcoholic datasets
df_ol <- read.csv("data/ol.csv")
df_cider <- read.csv("data/cider.csv")
df_sprit <- read.csv("data/sprit.csv")
df_vin <- read.csv("data/vin.csv")

# bind all in one dataset
df <- rbind(df_ol, df_cider, df_sprit, df_vin)

# remove unused objects
rm(df_ol, df_cider, df_sprit, df_vin)


# clean/prepare dataset
df <- df %>% 
  mutate(price_p_volume = price / volume_ml,
         alc_per_price = alcohol_percent / price_p_volume,
         category_1 = as.factor(category_1))
head(df) %>% 
  kable()
Table 1: Summary of scraped data
category_group product_id product_name product_number category_1 category_2 category_3 country producer alcohol_percent volume_ml price comparison_price taste package launch_date is_out_of_stock price_p_volume alc_per_price
ol 1000155 Midas Golden Pilsner 8936603 Öl Ljus lager Pilsner - tjeckisk stil Sverige Imperiebryggeriet 4.9 330 32.8 99.39 Maltig smak med inslag av ljust bröd, apelsinskal, örter och aprikos. Flaska 2020-09-01T00:00:00 FALSE 0.0993939 49.29878
ol 1000432 Schlappeseppel Specialität 8983803 Öl Mellanmörk & Mörk lager Märzen och wienerstil Tyskland Schlappeseppel GmbH 5.6 330 35.9 108.79 NA Flaska 2015-09-01T00:00:00 FALSE 0.1087879 51.47632
ol 1000438 Schlappeseppel 8998601 Öl Veteöl Hefeweizen Tyskland Schlappeseppel GmbH 5.5 500 35.1 70.20 NA Flaska 2015-09-01T00:00:00 FALSE 0.0702000 78.34758
ol 1000441 Schlappeseppel 8993603 Öl Ljus lager Pilsner - tysk stil Tyskland Schlappeseppel GmbH 5.1 330 26.1 79.09 NA Flaska 2015-09-01T00:00:00 FALSE 0.0790909 64.48276
ol 1000444 Schlappeseppel 8962103 Öl Ljus lager Zwickel, keller- och landbier Tyskland Schlappeseppel GmbH 5.5 330 27.1 82.12 Maltig smak med inslag av ljust bröd, timjan och apelsinskal. Flaska 2015-09-01T00:00:00 FALSE 0.0821212 66.97417
ol 1001367 This Ale Is India Pale 3064703 Öl Ale India pale ale (IPA) Sverige Skelderwikens Brygghus 5.8 330 29.4 89.09 Fruktig, humlearomatisk smak med tydlig beska inslag av grapefruktskal, halm, tallbarr, aprikos och tobak. Flaska 2015-10-01T00:00:00 FALSE 0.0890909 65.10204

Products

Barplot with best Alcohol / Price Ratio

df %>% 
  mutate(price_p_volume = price / volume_ml) %>% 
  mutate(alc_per_price = alcohol_percent / price_p_volume) %>% 
  arrange(desc(alc_per_price)) %>% 
  head(10) %>%
  ggplot(aes(x = reorder(product_name, alc_per_price), y = alc_per_price, fill = category_1)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0("Alcohol: ", round(alcohol_percent, 1), "%\nPrice: ", round(price, 2))), 
            position = position_stack(vjust = 0.5), size = 3) +
  coord_flip() +
  labs(title = "Top 10 Systembolaget Products by Alcohol Content per Price",
       x = "Product",
       y = "Alcohol % / Price per Volume",
       fill = "Category") +
  theme_minimal() +
  theme(legend.position = "right")
Code

Best Beer

df %>% 
  filter(category_1 == "Öl") %>% 
  arrange(price_p_volume) %>% 
  head(5) %>%
  ggplot(aes(x = reorder(product_id, price_p_volume), y = price_p_volume)) +
  geom_bar(stat = "identity", fill = "#1f78b4") +
  geom_text(aes(label = paste0(product_name, "\nVolume: ", volume_ml, "ml\nPrice: ", round(price, 2))), 
            position = position_stack(vjust = 0.5), size = 3.5) +
  coord_flip() +
  labs(title = "Top 5 Beers Price per Volume",
       x = "Product",
       y = "Price per Volume",
       fill = "Category") +
  theme_classic() +
  theme(legend.position = "right")


Deeper Price - Alcohol Analysis

Distribution of Alcohol/Price

df %>% 
  ggplot(aes(x = log(price))) +
  geom_histogram(binwidth = 0.2, color = "white", fill = "#1f78b4", alpha = 0.8) +
  labs(title = "Distribution of Product Prices",
       x = "Log(Price)",
       y = "Frequency") +
  theme_minimal()
Code

df %>% 
  ggplot(aes(x = log(price), fill = category_1)) +
  geom_density(alpha = 0.6) +
  labs(title = "Distribution of Product Prices by Category",
       x = "Log(Log(Price))",
       y = "Density",
       fill = "Product Category") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2") + 
  theme(legend.position = "bottom",
        legend.direction = "horizontal")
Code

OLS with Interaction


Call:
lm(formula = log(price) ~ 1 + alcohol_percent + category_1 + 
    (alcohol_percent * category_1), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.3338 -0.4547 -0.1229  0.3033  6.8641 

Coefficients:
                                Estimate Std. Error t value Pr(>|t|)    
(Intercept)                      2.24607    0.13775  16.306  < 2e-16 ***
alcohol_percent                  0.27538    0.02516  10.945  < 2e-16 ***
category_1Öl                     0.49831    0.14395   3.462 0.000537 ***
category_1Sprit                  2.16709    0.14590  14.853  < 2e-16 ***
category_1Vin                    2.53985    0.14607  17.387  < 2e-16 ***
alcohol_percent:category_1Öl    -0.12942    0.02596  -4.985 6.23e-07 ***
alcohol_percent:category_1Sprit -0.22834    0.02518  -9.067  < 2e-16 ***
alcohol_percent:category_1Vin   -0.20726    0.02542  -8.153 3.71e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.7942 on 24903 degrees of freedom
Multiple R-squared:  0.577, Adjusted R-squared:  0.5769 
F-statistic:  4852 on 7 and 24903 DF,  p-value: < 2.2e-16
lm(data=df,
   log(price) ~ 1 + alcohol_percent + category_1 + (alcohol_percent*category_1)) %>% 
  summary()
Code

df %>%
  mutate(predicted = predict(lm(log(price) ~ 1 + alcohol_percent + category_1 + alcohol_percent:category_1, data = df))) %>% 
  ggplot(aes(x = alcohol_percent, y = predicted, color = category_1)) +
  geom_point(aes(y = log(price)), size = 2, alpha = 0.08) +
  geom_line(size = 2, alpha = 1) +
  labs(
    title = "Interaction Effect of Alcohol Percent and Category on Log(Price)",
    x = "Alcohol Percent",
    y = "Predicted Log(Price)",
    color = "Category"
  ) +
  theme_minimal()
Code

Fixed Effect Model


Call:
lm(formula = log(price) ~ 1 + alcohol_percent, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.6598 -0.6514 -0.0166  0.5375  6.5906 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     4.5752080  0.0108671   421.0   <2e-16 ***
alcohol_percent 0.0496754  0.0004863   102.2   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.025 on 24909 degrees of freedom
Multiple R-squared:  0.2952,    Adjusted R-squared:  0.2952 
F-statistic: 1.043e+04 on 1 and 24909 DF,  p-value: < 2.2e-16
lm(formula = log(price) ~ 1 + alcohol_percent,
   data = df) %>% 
  summary()
Code
Oneway (individual) effect Within Model

Call:
plm(formula = log(price) ~ 1 + alcohol_percent, data = df, model = "within", 
    index = "category_1")

Unbalanced Panel: n = 4, T = 412-15189, N = 24911

Residuals:
    Min.  1st Qu.   Median  3rd Qu.     Max. 
-2.44806 -0.46050 -0.13303  0.31490  6.86099 

Coefficients:
                 Estimate Std. Error t-value  Pr(>|t|)    
alcohol_percent 0.0519468  0.0010669  48.688 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Total Sum of Squares:    17433
Residual Sum of Squares: 15918
R-Squared:      0.086906
Adj. R-Squared: 0.086759
F-statistic: 2370.49 on 1 and 24906 DF, p-value: < 2.22e-16
plm(formula = log(price) ~ 1 + alcohol_percent,
    index = "category_1",
    model = "within",
    data = df) %>% 
summary()
Code

df %>%
  group_by(category_1) %>%
  mutate(mean_alcohol_percent = mean(alcohol_percent)) %>%
  mutate(mean_log_price = mean(log(price))) %>% 
  ungroup() %>% 
  mutate(normalized_alcohol_percent = alcohol_percent - mean_alcohol_percent) %>% 
  mutate(normalized_log_price = log(price) - mean_log_price) %>%
  ggplot(aes(x = normalized_alcohol_percent, y = log(price), color = category_1)) +
  geom_point(alpha=0.2) +  # Standard scatter plot
  labs(title = "Scatterplot of normalized log(price) vs normalized alcohol_percent",
       x = "Normalized Alcohol Percent",
       y = "Normalized Log(Price)") +
  theme_minimal()
Code