Star Trek 🖖에서 인간/컴퓨터 상호 작용 모델링

์ด๊ฒƒ์€ screencasts ํŒจํ‚ค์ง€๋ฅผ ์‚ฌ์šฉํ•˜๋Š” ๋ฐฉ๋ฒ•์„ ๋ณด์—ฌ์ฃผ๋Š” ์ตœ์‹  ์‹œ๋ฆฌ์ฆˆtidymodels๋กœ, ์ด์ œ ๋ง‰ ์‹œ์ž‘ํ•˜๋Š” ๊ฒƒ๋ถ€ํ„ฐ ๋” ๋ณต์žกํ•œ ๋ชจ๋ธ์„ ์กฐ์ •ํ•˜๋Š” ๊ฒƒ๊นŒ์ง€ ํฌํ•จํ•ฉ๋‹ˆ๋‹ค. ์˜ค๋Š˜์˜ ์Šคํฌ๋ฆฐ์บ์ŠคํŠธ๋Š” Star Trek ์ธ๊ฐ„/์ปดํ“จํ„ฐ ์ƒํ˜ธ ์ž‘์šฉ์— ๋Œ€ํ•œ ์ด๋ฒˆ ์ฃผworkflowsets์™€ ํ•จ๊ป˜ #TidyTuesday dataset์„ ํ†ตํ•ด ๊ธฐ๋Šฅ ์—”์ง€๋‹ˆ์–ด๋ง ๋ฐ ๋ชจ๋ธ๋ง ์ ‘๊ทผ ๋ฐฉ์‹์˜ ์—ฌ๋Ÿฌ ์กฐํ•ฉ์„ ํ‰๊ฐ€ํ•˜๋Š” ๋ฐฉ๋ฒ•์— ๋Œ€ํ•œ ๊ณ ๊ธ‰ ์ฃผ์ œ์— ๊ด€ํ•œ ๊ฒƒ์ž…๋‹ˆ๋‹ค.



๋‹ค์Œ์€ ๋น„๋””์˜ค ๋Œ€์‹  ๋˜๋Š” ๋น„๋””์˜ค์— ์ถ”๊ฐ€ํ•˜์—ฌ ์ฝ๊ธฐ๋ฅผ ์„ ํ˜ธํ•˜๋Š” ์‚ฌ๋žŒ๋“ค์„ ์œ„ํ•ด ๋น„๋””์˜ค์—์„œ ์‚ฌ์šฉํ•œ ์ฝ”๋“œ์ž…๋‹ˆ๋‹ค.

๋ฐ์ดํ„ฐ ํƒ์ƒ‰



์šฐ๋ฆฌ์˜ ๋ชจ๋ธ๋ง ๋ชฉํ‘œ๋Š” ์‚ฌ๋žŒ์ด ๋งํ•œ ๊ฒƒ๊ณผ ์ปดํ“จํ„ฐ๊ฐ€ ๋งํ•œ ๊ฒƒcomputer interactions from Star Trek์„ ์˜ˆ์ธกํ•˜๋Š” ๊ฒƒ์ž…๋‹ˆ๋‹ค.

library(tidyverse)
# Download the TidyTuesday Star Trek computer-interaction data.
computer_raw <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-08-17/computer.csv"
)

# Keep one row per value_id, then tally the rows for each char_type.
count(
  distinct(computer_raw, value_id, .keep_all = TRUE),
  char_type
)


## # A tibble: 2 × 2
## char_type n
## <chr> <int>
## 1 Computer 178
## 2 Person 234



컴퓨터와 사람 각각이 말할 가능성이 더 높은 단어는 무엇입니까?

library(tidytext)
library(tidylo)

# Word counts per char_type: deduplicate on value_id, split each interaction
# into one-word-per-row tokens, then count with the largest n first.
computer_counts <- local({
  unique_rows <- distinct(computer_raw, value_id, .keep_all = TRUE)
  word_rows <- unnest_tokens(unique_rows, word, interaction)
  count(word_rows, char_type, word, sort = TRUE)
})

# Plot the ten words with the highest weighted log odds for each char_type.
computer_counts %>%
  bind_log_odds(char_type, word, n) %>%
  # Drop words counted 10 times or fewer before ranking.
  filter(n > 10) %>%
  group_by(char_type) %>%
  slice_max(log_odds_weighted, n = 10) %>%
  ungroup() %>%
  ggplot(aes(
    log_odds_weighted,
    # Order words within each facet. A single global fct_reorder() would rank
    # a word that appears in both facets by its pooled value, which scrambles
    # the bar order inside the individual panels.
    reorder_within(word, log_odds_weighted, char_type),
    fill = char_type
  )) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(vars(char_type), scales = "free_y") +
  # Strip the facet suffix that reorder_within() appends to the axis labels.
  scale_y_reordered() +
  labs(y = NULL)





불용어가 가중 로그 오즈(weighted log odds)가 가장 높은 단어에 속한다는 점에 주목하세요. 불용어는 이 상황에서 매우 유익한 정보를 줍니다.

๋ชจ๋ธ ๊ตฌ์ถ• ๋ฐ ๋น„๊ต



"๋ฐ์ดํ„ฐ ์˜ˆ์‚ฐ"์„ ์„ค์ •ํ•˜์—ฌ ๋ชจ๋ธ๋ง์„ ์‹œ์ž‘ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค. ์ด๊ฒƒ์€ ๋งค์šฐ ์ž‘์€ ๋ฐ์ดํ„ฐ ์„ธํŠธ์ด๋ฏ€๋กœ ๋ชจ๋ธ์—์„œ ๋†€๋ผ์šด ๊ฒฐ๊ณผ๋ฅผ ๊ธฐ๋Œ€ํ•˜์ง€๋Š” ์•Š์ง€๋งŒ ์ด๋Ÿฌํ•œ ๊ฐœ๋… ์ค‘ ์ผ๋ถ€๋ฅผ ๋ณด์—ฌ์ฃผ๋Š” ๊ฒƒ์€ ์žฌ๋ฏธ์žˆ๊ณ  ์ข‹์€ ๋ฐฉ๋ฒ•์ž…๋‹ˆ๋‹ค.

library(tidymodels)

# Data budget: an 80/20 train/test split stratified by char_type.
set.seed(123)

comp_split <- initial_split(
  select(
    distinct(computer_raw, value_id, .keep_all = TRUE),
    char_type, interaction
  ),
  prop = 0.8,
  strata = char_type
)

comp_test <- testing(comp_split)
comp_train <- training(comp_split)

# Stratified bootstrap resamples of the training set.
set.seed(234)
comp_folds <- bootstraps(data = comp_train, strata = char_type)
comp_folds


## # Bootstrap sampling using stratification 
## # A tibble: 25 × 2
## splits id         
## <list> <chr>      
## 1 <split [329/118]> Bootstrap01
## 2 <split [329/128]> Bootstrap02
## 3 <split [329/134]> Bootstrap03
## 4 <split [329/124]> Bootstrap04
## 5 <split [329/118]> Bootstrap05
## 6 <split [329/116]> Bootstrap06
## 7 <split [329/106]> Bootstrap07
## 8 <split [329/124]> Bootstrap08
## 9 <split [329/121]> Bootstrap09
## 10 <split [329/121]> Bootstrap10
## # … with 15 more rows



피처 엔지니어링(feature engineering)과 관련하여 불용어를 제거해야 하는지, 예측 변수를 중심화하고 크기를 조정해야 하는지, 클래스의 균형을 맞춰야 하는지 미리 알 수 없습니다. 성능을 비교할 수 있도록 이러한 모든 작업을 수행하는 피처 엔지니어링 레시피를 만들어 보겠습니다.

library(textrecipes)
library(themis)

# Base recipe: tokenize the interaction text, keep at most 80 tokens, and
# convert them to tf-idf features.
rec_all <- local({
  rec <- recipe(char_type ~ interaction, data = comp_train)
  rec <- step_tokenize(rec, interaction)
  rec <- step_tokenfilter(rec, interaction, max_tokens = 80)
  step_tfidf(rec, interaction)
})

# Variant that also centers and scales every predictor.
rec_all_norm <- step_normalize(rec_all, all_predictors())

# Variant that additionally applies SMOTE on the outcome, after normalizing.
rec_all_smote <- step_smote(rec_all_norm, char_type)

## we can `prep()` just to check if it works
prep(rec_all_smote)


## Data Recipe
## 
## Inputs:
## 
## role #variables
## outcome 1
## predictor 1
## 
## Training data contained 329 data points and no missing data.
## 
## Operations:
## 
## Tokenization for interaction [trained]
## Text filtering for interaction [trained]
## Term frequency-inverse document frequency with interaction [trained]
## Centering and scaling for tfidf_interaction_a, ... [trained]
## SMOTE based on char_type [trained]



이제 불용어를 제거한 상태로 동일한 작업을 수행해 보겠습니다.

# Same pipeline as the base recipe, but stop words are removed right after
# tokenization and before the 80-token filter.
rec_stop <- local({
  rec <- recipe(char_type ~ interaction, data = comp_train)
  rec <- step_tokenize(rec, interaction)
  rec <- step_stopwords(rec, interaction)
  rec <- step_tokenfilter(rec, interaction, max_tokens = 80)
  step_tfidf(rec, interaction)
})

# Variant that also centers and scales every predictor.
rec_stop_norm <- step_normalize(rec_stop, all_predictors())

# Variant that additionally applies SMOTE on the outcome, after normalizing.
rec_stop_smote <- step_smote(rec_stop_norm, char_type)

## again, let's check it
prep(rec_stop_smote)


## Data Recipe
## 
## Inputs:
## 
## role #variables
## outcome 1
## predictor 1
## 
## Training data contained 329 data points and no missing data.
## 
## Operations:
## 
## Tokenization for interaction [trained]
## Stop word removal for interaction [trained]
## Text filtering for interaction [trained]
## Term frequency-inverse document frequency with interaction [trained]
## Centering and scaling for 80 items [trained]
## SMOTE based on char_type [trained]



텍스트 데이터에 잘 작동하는 경우가 많은 두 가지 모델인 서포트 벡터 머신과 나이브 베이즈 모델을 사용해 봅시다.

library(discrim)

# Naive Bayes classification spec using the naivebayes engine.
nb_spec <- set_engine(
  set_mode(naive_Bayes(), "classification"),
  "naivebayes"
)

nb_spec


## Naive Bayes Model Specification (classification)
## 
## Computational engine: naivebayes


# Linear SVM classification spec using the LiblineaR engine.
svm_spec <- set_engine(
  set_mode(svm_linear(), "classification"),
  "LiblineaR"
)

svm_spec


## Linear Support Vector Machine Specification (classification)
## 
## Computational engine: LiblineaR



์ด์ œ ์šฐ๋ฆฌ๋Š” ์ด ๋ชจ๋“  ๊ฒƒ์„ workflowset์— ํ•จ๊ป˜ ๋„ฃ์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.

# Cross every preprocessing recipe with every model spec
# (6 recipes x 2 models = 12 workflows).
comp_models <- local({
  recipe_list <- list(
    all = rec_all,
    all_norm = rec_all_norm,
    all_smote = rec_all_smote,
    stop = rec_stop,
    stop_norm = rec_stop_norm,
    stop_smote = rec_stop_smote
  )
  spec_list <- list(nb = nb_spec, svm = svm_spec)

  workflow_set(preproc = recipe_list, models = spec_list, cross = TRUE)
})

comp_models


## # A workflow set/tibble: 12 × 4
## wflow_id info option result    
## <chr> <list> <list> <list>    
## 1 all_nb <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 all_svm <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 all_norm_nb <tibble [1 × 4]> <opts[0]> <list [0]>
## 4 all_norm_svm <tibble [1 × 4]> <opts[0]> <list [0]>
## 5 all_smote_nb <tibble [1 × 4]> <opts[0]> <list [0]>
## 6 all_smote_svm <tibble [1 × 4]> <opts[0]> <list [0]>
## 7 stop_nb <tibble [1 × 4]> <opts[0]> <list [0]>
## 8 stop_svm <tibble [1 × 4]> <opts[0]> <list [0]>
## 9 stop_norm_nb <tibble [1 × 4]> <opts[0]> <list [0]>
## 10 stop_norm_svm <tibble [1 × 4]> <opts[0]> <list [0]>
## 11 stop_smote_nb <tibble [1 × 4]> <opts[0]> <list [0]>
## 12 stop_smote_svm <tibble [1 × 4]> <opts[0]> <list [0]>



이러한 모델에는 조정 매개변수가 없으므로, fit_resamples()와 부트스트랩 리샘플을 사용하여 피처 엔지니어링(feature engineering) 레시피와 모델 사양의 각 조합이 어떤 성능을 보이는지 평가해 보겠습니다.

# Fix the RNG seed, then register a parallel backend before resampling.
set.seed(123)
doParallel::registerDoParallel()

# Run fit_resamples() for every workflow in the set on the bootstrap
# resamples, collecting accuracy, sensitivity, and specificity.
computer_rs <- workflow_map(
  comp_models,
  fn = "fit_resamples",
  resamples = comp_folds,
  metrics = metric_set(accuracy, sensitivity, specificity)
)



이러한 결과를 개략적인 수준에서 빠르게 시각화할 수 있습니다.

# Default plot of the resampling results across all workflows.
computer_rs %>%
  autoplot()





๋ชจ๋“  SVM์€ ์ ์–ด๋„ ์ „๋ฐ˜์ ์ธ ์ •ํ™•๋„์— ์žˆ์–ด์„œ๋Š” ๋ชจ๋“  ์ˆœ์ง„ํ•œ Bayes ๋ชจ๋ธ๋ณด๋‹ค ๋” ๋‚˜์•˜์Šต๋‹ˆ๋‹ค. ๋˜ํ•œ ๋” ๊นŠ์ด ํŒŒ๊ณ ๋“ค์–ด ๊ฒฐ๊ณผ๋ฅผ ๋” ๋งŽ์ด ํƒ์ƒ‰ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.

# Ranked workflow results, restricted to the accuracy metric.
filter(
  rank_results(computer_rs),
  .metric == "accuracy"
)


## # A tibble: 12 × 9
## wflow_id .config .metric mean std_err n preprocessor model rank
## <chr> <chr> <chr> <dbl> <dbl> <int> <chr> <chr> <int>
## 1 all_svm Preprocess… accuracy 0.679 0.00655 25 recipe svm_l… 1
## 2 all_norm_… Preprocess… accuracy 0.658 0.00756 25 recipe svm_l… 2
## 3 stop_svm Preprocess… accuracy 0.652 0.00700 25 recipe svm_l… 3
## 4 all_smote… Preprocess… accuracy 0.650 0.00611 25 recipe svm_l… 4
## 5 stop_norm… Preprocess… accuracy 0.646 0.00753 25 recipe svm_l… 5
## 6 stop_smot… Preprocess… accuracy 0.632 0.00914 25 recipe svm_l… 6
## 7 all_norm_… Preprocess… accuracy 0.589 0.00678 25 recipe naive… 7
## 8 all_smote… Preprocess… accuracy 0.575 0.0115 25 recipe naive… 8
## 9 stop_smot… Preprocess… accuracy 0.573 0.00971 25 recipe naive… 9
## 10 stop_norm… Preprocess… accuracy 0.571 0.00950 25 recipe naive… 10
## 11 all_nb Preprocess… accuracy 0.570 0.0102 25 recipe naive… 11
## 12 stop_nb Preprocess… accuracy 0.559 0.0120 25 recipe naive… 12



์ฃผ๋ชฉํ•ด์•ผ ํ•  ๋ช‡ ๊ฐ€์ง€ ํฅ๋ฏธ๋กœ์šด ์‚ฌํ•ญ์€ ๋‹ค์Œ๊ณผ ๊ฐ™์Šต๋‹ˆ๋‹ค.
  • SMOTE๋ฅผ ํ†ตํ•ด ํด๋ž˜์Šค์˜ ๊ท ํ˜•์„ ์กฐ์ •ํ•˜๋ฉด ์‹ค์ œ๋กœ ๋ฏผ๊ฐ๋„์™€ ํŠน์ด๋„๊ฐ€ ์˜ˆ์ƒ๋Œ€๋กœ ๋ณ€๊ฒฝ๋ฉ๋‹ˆ๋‹ค
  • .
  • ๋ถˆ์šฉ์–ด๋ฅผ ์ œ๊ฑฐํ•˜๋Š” ๊ฒƒ์€ ๋Œ€๋ถ€๋ถ„ ๋‚˜์œ ์ƒ๊ฐ์ธ ๊ฒƒ ๊ฐ™์Šต๋‹ˆ๋‹ค!

  • ์ตœ์ข… ๋ชจ๋ธ ํ•™์Šต ๋ฐ ํ‰๊ฐ€



    ์ „์ฒด ์ •ํ™•๋„๋ฅผ ๋†’๊ฒŒ ์œ ์ง€ํ•˜๊ณ  ์‹ถ์œผ๋ฏ€๋กœ rec_all ๋ฐ svm_spec๋ฅผ ์„ ํƒํ•ฉ๋‹ˆ๋‹ค. last_fit()๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ๋ชจ๋“  ๊ต์œก ๋ฐ์ดํ„ฐ์— ํ•œ ๋ฒˆ ์ ํ•ฉํ•˜๊ณ  ํ…Œ์ŠคํŠธ ๋ฐ์ดํ„ฐ์—์„œ ํ•œ ๋ฒˆ ํ‰๊ฐ€ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.

    # Final workflow: the base recipe (stop words kept) with the linear SVM.
    comp_wf <- workflow(preprocessor = rec_all, spec = svm_spec)

    # last_fit() fits on the training portion of the split and evaluates on
    # the testing portion, using the same three metrics as the resampling.
    comp_fitted <- last_fit(
      comp_wf,
      split = comp_split,
      metrics = metric_set(accuracy, sensitivity, specificity)
    )

    comp_fitted
    
    
    ## # Resampling results
    ## # Manual resampling 
    ## # A tibble: 1 × 6
    ## splits id .metrics .notes .predictions .workflow
    ## <list> <chr> <list> <list> <list> <list>   
    ## 1 <split [329/83]> train/test split <tibble [… <tibble … <tibble [83 … <workflo…
    
    


    결과는 어떻게 되었습니까?

    # Test-set metrics from the final fit.
    comp_fitted %>%
      collect_metrics()
    
    
    ## # A tibble: 3 × 4
    ## .metric .estimator .estimate .config             
    ## <chr> <chr> <dbl> <chr>               
    ## 1 accuracy binary 0.735 Preprocessor1_Model1
    ## 2 sens binary 0.611 Preprocessor1_Model1
    ## 3 spec binary 0.830 Preprocessor1_Model1
    
    


    예측을 살펴볼 수도 있고, 예를 들어 혼동 행렬을 만들 수도 있습니다.

    # Confusion matrix of the final fit's predictions (truth = char_type,
    # estimate = .pred_class), drawn with autoplot().
    autoplot(
      conf_mat(collect_predictions(comp_fitted), char_type, .pred_class)
    )
    
    




    그 반대의 경우보다 컴퓨터와 대화하는 사람을 식별하는 것이 더 쉬웠습니다.

    ์ด๊ฒƒ์€ ์„ ํ˜• ๋ชจ๋ธ์ด๊ธฐ ๋•Œ๋ฌธ์— ๊ฐ ๋ฐฉํ–ฅ์—์„œ ๊ฐ€์žฅ ํฐ ํšจ๊ณผ ํฌ๊ธฐ ํ•ญ์— ๋Œ€ํ•ด ๋ชจ๋ธ์˜ ๋‹จ์–ด์— ๋Œ€ํ•œ ๊ณ„์ˆ˜๋ฅผ ๋ณผ ์ˆ˜๋„ ์žˆ์Šต๋‹ˆ๋‹ค.

    # Plot the ten largest-magnitude coefficients in each direction of the
    # fitted linear model.
    extract_workflow(comp_fitted) %>%
      tidy() %>%
      group_by(estimate > 0) %>%
      slice_max(abs(estimate), n = 10) %>%
      ungroup() %>%
      # Drop the tf-idf column prefix so the axis shows the bare word.
      mutate(term = str_remove(term, "tfidf_interaction_")) %>%
      ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
      geom_col(alpha = 0.8) +
      # Name the labels by fill value instead of relying on position, so each
      # sign keeps its label even if only one sign is present in the data.
      scale_fill_discrete(
        labels = c(`FALSE` = "people", `TRUE` = "computer")
      ) +
      labs(y = NULL, fill = "More from...")
    
    


    ์ข‹์€ ์›นํŽ˜์ด์ง€ ์ฆ๊ฒจ์ฐพ๊ธฐ