Star Trek ๐์์ ์ธ๊ฐ/์ปดํจํฐ ์ํธ ์์ฉ ๋ชจ๋ธ๋ง
12167 ๋จ์ด · #rstats #datascience #machinelearning #tutorial
#TidyTuesday
dataset์ ํตํด ๊ธฐ๋ฅ ์์ง๋์ด๋ง ๋ฐ ๋ชจ๋ธ๋ง ์ ๊ทผ ๋ฐฉ์์ ์ฌ๋ฌ ์กฐํฉ์ ํ๊ฐํ๋ ๋ฐฉ๋ฒ์ ๋ํ ๊ณ ๊ธ ์ฃผ์ ์ ๊ดํ ๊ฒ์
๋๋ค.๋ค์์ ๋น๋์ค ๋์ ๋๋ ๋น๋์ค์ ์ถ๊ฐํ์ฌ ์ฝ๊ธฐ๋ฅผ ์ ํธํ๋ ์ฌ๋๋ค์ ์ํด ๋น๋์ค์์ ์ฌ์ฉํ ์ฝ๋์ ๋๋ค.
๋ฐ์ดํฐ ํ์
์ฐ๋ฆฌ์ ๋ชจ๋ธ๋ง ๋ชฉํ๋ ์ฌ๋์ด ๋งํ ๊ฒ๊ณผ ์ปดํจํฐ๊ฐ ๋งํ ๊ฒ(computer interactions from Star Trek)์ ์์ธกํ๋ ๊ฒ์ ๋๋ค.
library(tidyverse)

# Read the raw TidyTuesday Star Trek computer-interaction transcript.
computer_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-08-17/computer.csv")

# Utterances can appear more than once; keep one row per value_id and
# tally how many lines come from the computer vs. a person.
computer_raw |>
  distinct(value_id, .keep_all = TRUE) |>
  count(char_type)
## # A tibble: 2 ร 2
## char_type n
## <chr> <int>
## 1 Computer 178
## 2 Person 234
์ปดํจํฐ์ ์ฌ๋์ด ๋งํ ๊ฐ๋ฅ์ฑ์ด ๋ ๋์ ๋จ์ด๋ ๋ฌด์์ ๋๊น?
library(tidytext)
library(tidylo)

# Tokenize each distinct utterance into words and count word frequency
# by speaker type.
computer_counts <-
  computer_raw |>
  distinct(value_id, .keep_all = TRUE) |>
  unnest_tokens(word, interaction) |>
  count(char_type, word, sort = TRUE)

# Weighted log odds surface the words most characteristic of each
# speaker type; restrict to reasonably common words (n > 10) and plot
# the top ten per group.
computer_counts |>
  bind_log_odds(char_type, word, n) |>
  filter(n > 10) |>
  group_by(char_type) |>
  slice_max(log_odds_weighted, n = 10) |>
  ungroup() |>
  ggplot(aes(
    log_odds_weighted,
    fct_reorder(word, log_odds_weighted),
    fill = char_type
  )) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(vars(char_type), scales = "free_y") +
  labs(y = NULL)
๋ถ์ฉ์ด๋ ๊ฐ์ค ๋ก๊ทธ ์น์ฐ์ด ๊ฐ์ฅ ๋์ ๋จ์ด์ ์ํฉ๋๋ค. ๊ทธ๋ค์ ์ด ์ํฉ์์ ๋งค์ฐ ์ ์ตํฉ๋๋ค.
๋ชจ๋ธ ๊ตฌ์ถ ๋ฐ ๋น๊ต
"๋ฐ์ดํฐ ์์ฐ"์ ์ค์ ํ์ฌ ๋ชจ๋ธ๋ง์ ์์ํ๊ฒ ์ต๋๋ค. ์ด๊ฒ์ ๋งค์ฐ ์์ ๋ฐ์ดํฐ ์ธํธ์ด๋ฏ๋ก ๋ชจ๋ธ์์ ๋๋ผ์ด ๊ฒฐ๊ณผ๋ฅผ ๊ธฐ๋ํ์ง๋ ์์ง๋ง ์ด๋ฌํ ๊ฐ๋ ์ค ์ผ๋ถ๋ฅผ ๋ณด์ฌ์ฃผ๋ ๊ฒ์ ์ฌ๋ฏธ์๊ณ ์ข์ ๋ฐฉ๋ฒ์ ๋๋ค.
library(tidymodels)

# Fix the RNG so the train/test split is reproducible.
set.seed(123)

# One row per utterance, keep only outcome + text, then hold out 20%
# of rows, stratified by speaker type so both classes keep similar
# proportions in train and test.
comp_split <-
computer_raw %>%
distinct(value_id, .keep_all = TRUE) %>%
select(char_type, interaction) %>%
initial_split(prop = 0.8, strata = char_type)
comp_train <- training(comp_split)
comp_test <- testing(comp_split)

# Separate seed for resampling: stratified bootstrap resamples of the
# training set (25 by default, as shown in the output below) used to
# compare the candidate workflows.
set.seed(234)
comp_folds <- bootstraps(comp_train, strata = char_type)
comp_folds
## # Bootstrap sampling using stratification
## # A tibble: 25 ร 2
## splits id
## <list> <chr>
## 1 <split [329/118]> Bootstrap01
## 2 <split [329/128]> Bootstrap02
## 3 <split [329/134]> Bootstrap03
## 4 <split [329/124]> Bootstrap04
## 5 <split [329/118]> Bootstrap05
## 6 <split [329/116]> Bootstrap06
## 7 <split [329/106]> Bootstrap07
## 8 <split [329/124]> Bootstrap08
## 9 <split [329/121]> Bootstrap09
## 10 <split [329/121]> Bootstrap10
## # โฆ with 15 more rows
๊ธฐ๋ฅ ์์ง๋์ด๋ง๊ณผ ๊ด๋ จํ์ฌ ๋ถ์ฉ์ด๋ฅผ ์ ๊ฑฐํด์ผ ํ๋์ง, ์์ธก ๋ณ์๋ฅผ ์ค์์ ๋๊ณ ํฌ๊ธฐ๋ฅผ ์กฐ์ ํด์ผ ํ๋์ง, ํด๋์ค์ ๊ท ํ์ ๋ง์ถฐ์ผ ํ๋์ง ๋ฏธ๋ฆฌ ์ ์ ์์ต๋๋ค. ์ฑ๋ฅ์ ๋น๊ตํ ์ ์๋๋ก ์ด๋ฌํ ๋ชจ๋ ์์ ์ ์ํํ๋ ๊ธฐ๋ฅ ์์ง๋์ด๋ง ๋ ์ํผ๋ฅผ ๋ง๋ค์ด ๋ณด๊ฒ ์ต๋๋ค.
library(textrecipes)
library(themis)

# Base recipe: tokenize the utterance text, keep the 80 most frequent
# tokens, and weight them by tf-idf.
rec_all <-
recipe(char_type ~ interaction, data = comp_train) %>%
step_tokenize(interaction) %>%
step_tokenfilter(interaction, max_tokens = 80) %>%
step_tfidf(interaction)

# Variant: additionally center and scale all tf-idf predictors.
rec_all_norm <-
rec_all %>%
step_normalize(all_predictors())

# Variant: additionally balance the classes with SMOTE, built on top
# of the normalized recipe.
rec_all_smote <-
rec_all_norm %>%
step_smote(char_type)
## we can `prep()` just to check if it works
prep(rec_all_smote)
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Training data contained 329 data points and no missing data.
##
## Operations:
##
## Tokenization for interaction [trained]
## Text filtering for interaction [trained]
## Term frequency-inverse document frequency with interaction [trained]
## Centering and scaling for tfidf_interaction_a, ... [trained]
## SMOTE based on char_type [trained]
์ด์ ๋ถ์ฉ์ด๋ฅผ ์ ๊ฑฐํ๋ ๊ฒ๊ณผ ๋์ผํ ์์ ์ ์ํํด ๋ณด๊ฒ ์ต๋๋ค.
# Same pipeline as rec_all but with stop words removed before the
# token filter, so the 80 kept tokens are content words.
rec_stop <-
recipe(char_type ~ interaction, data = comp_train) %>%
step_tokenize(interaction) %>%
step_stopwords(interaction) %>%
step_tokenfilter(interaction, max_tokens = 80) %>%
step_tfidf(interaction)

# Variant: normalized predictors.
rec_stop_norm <-
rec_stop %>%
step_normalize(all_predictors())

# Variant: normalized + SMOTE class balancing.
rec_stop_smote <-
rec_stop_norm %>%
step_smote(char_type)
## again, let's check it
prep(rec_stop_smote)
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 1
##
## Training data contained 329 data points and no missing data.
##
## Operations:
##
## Tokenization for interaction [trained]
## Stop word removal for interaction [trained]
## Text filtering for interaction [trained]
## Term frequency-inverse document frequency with interaction [trained]
## Centering and scaling for 80 items [trained]
## SMOTE based on char_type [trained]
ํ ์คํธ ๋ฐ์ดํฐ์ ์ ์๋ํ๋ ๋ ๊ฐ์ง ๋ชจ๋ธ์ธ ์ํฌํธ ๋ฒกํฐ ๋จธ์ ๊ณผ ๋์ด๋ธ ๋ฒ ์ด์ฆ ๋ชจ๋ธ์ ์ฌ์ฉํด ๋ด ์๋ค.
library(discrim)

# Naive Bayes specification (engine from the naivebayes package, made
# available to parsnip by discrim).
nb_spec <-
  naive_Bayes() |>
  set_engine("naivebayes") |>
  set_mode("classification")
nb_spec
## Naive Bayes Model Specification (classification)
##
## Computational engine: naivebayes
# Linear support vector machine via the LiblineaR engine.
svm_spec <-
  svm_linear() |>
  set_engine("LiblineaR") |>
  set_mode("classification")
svm_spec
## Linear Support Vector Machine Specification (classification)
##
## Computational engine: LiblineaR
์ด์ ์ฐ๋ฆฌ๋ ์ด ๋ชจ๋ ๊ฒ์ workflowset์ ํจ๊ป ๋ฃ์ ์ ์์ต๋๋ค.
# Cross every preprocessing recipe with every model specification:
# 6 recipes x 2 models = 12 candidate workflows to compare.
comp_models <-
workflow_set(
preproc = list(
all = rec_all,
all_norm = rec_all_norm,
all_smote = rec_all_smote,
stop = rec_stop,
stop_norm = rec_stop_norm,
stop_smote = rec_stop_smote
),
models = list(nb = nb_spec, svm = svm_spec),
cross = TRUE
)
comp_models
## # A workflow set/tibble: 12 ร 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 all_nb <tibble [1 ร 4]> <opts[0]> <list [0]>
## 2 all_svm <tibble [1 ร 4]> <opts[0]> <list [0]>
## 3 all_norm_nb <tibble [1 ร 4]> <opts[0]> <list [0]>
## 4 all_norm_svm <tibble [1 ร 4]> <opts[0]> <list [0]>
## 5 all_smote_nb <tibble [1 ร 4]> <opts[0]> <list [0]>
## 6 all_smote_svm <tibble [1 ร 4]> <opts[0]> <list [0]>
## 7 stop_nb <tibble [1 ร 4]> <opts[0]> <list [0]>
## 8 stop_svm <tibble [1 ร 4]> <opts[0]> <list [0]>
## 9 stop_norm_nb <tibble [1 ร 4]> <opts[0]> <list [0]>
## 10 stop_norm_svm <tibble [1 ร 4]> <opts[0]> <list [0]>
## 11 stop_smote_nb <tibble [1 ร 4]> <opts[0]> <list [0]>
## 12 stop_smote_svm <tibble [1 ร 4]> <opts[0]> <list [0]>
์ด๋ฌํ ๋ชจ๋ธ์๋ ์กฐ์ ๋งค๊ฐ๋ณ์๊ฐ ์์ผ๋ฏ๋ก
fit_resamples()
๋ฅผ ์ฌ์ฉํ์ฌ ๋ถํธ์คํธ๋ฉ ๋ฆฌ์ํ์ ์ฌ์ฉํ์ฌ ๊ธฐ๋ฅ ์์ง๋์ด๋ง ๋ ์ํผ์ ๋ชจ๋ธ ์ฌ์์ ๊ฐ ์กฐํฉ์ด ์ด๋ป๊ฒ ์ํ๋๋์ง ํ๊ฐํด ๋ณด๊ฒ ์ต๋๋ค.
set.seed(123)
# Register a parallel backend so the 12 workflows fit faster.
doParallel::registerDoParallel()

# Neither model has tuning parameters, so fit_resamples() (rather than
# tune_grid()) evaluates each workflow on the bootstrap resamples.
computer_rs <-
comp_models %>%
workflow_map(
"fit_resamples",
resamples = comp_folds,
metrics = metric_set(accuracy, sensitivity, specificity)
)
์ฐ๋ฆฌ๋ ์ด๋ฌํ ๊ฒฐ๊ณผ๋ฅผ ๋น ๋ฅด๊ฒ ๋์ ์์ค์ผ๋ก ์๊ฐํํ ์ ์์ต๋๋ค.
# High-level visual comparison of all workflows across the metrics.
autoplot(computer_rs)
๋ชจ๋ SVM์ ์ ์ด๋ ์ ๋ฐ์ ์ธ ์ ํ๋์ ์์ด์๋ ๋ชจ๋ ์์งํ Bayes ๋ชจ๋ธ๋ณด๋ค ๋ ๋์์ต๋๋ค. ๋ํ ๋ ๊น์ด ํ๊ณ ๋ค์ด ๊ฒฐ๊ณผ๋ฅผ ๋ ๋ง์ด ํ์ํ ์ ์์ต๋๋ค.
# Rank all 12 workflows by mean bootstrap accuracy.
rank_results(computer_rs) %>%
filter(.metric == "accuracy")
## # A tibble: 12 ร 9
## wflow_id .config .metric mean std_err n preprocessor model rank
## <chr> <chr> <chr> <dbl> <dbl> <int> <chr> <chr> <int>
## 1 all_svm Preprocessโฆ accuracy 0.679 0.00655 25 recipe svm_lโฆ 1
## 2 all_norm_โฆ Preprocessโฆ accuracy 0.658 0.00756 25 recipe svm_lโฆ 2
## 3 stop_svm Preprocessโฆ accuracy 0.652 0.00700 25 recipe svm_lโฆ 3
## 4 all_smoteโฆ Preprocessโฆ accuracy 0.650 0.00611 25 recipe svm_lโฆ 4
## 5 stop_normโฆ Preprocessโฆ accuracy 0.646 0.00753 25 recipe svm_lโฆ 5
## 6 stop_smotโฆ Preprocessโฆ accuracy 0.632 0.00914 25 recipe svm_lโฆ 6
## 7 all_norm_โฆ Preprocessโฆ accuracy 0.589 0.00678 25 recipe naiveโฆ 7
## 8 all_smoteโฆ Preprocessโฆ accuracy 0.575 0.0115 25 recipe naiveโฆ 8
## 9 stop_smotโฆ Preprocessโฆ accuracy 0.573 0.00971 25 recipe naiveโฆ 9
## 10 stop_normโฆ Preprocessโฆ accuracy 0.571 0.00950 25 recipe naiveโฆ 10
## 11 all_nb Preprocessโฆ accuracy 0.570 0.0102 25 recipe naiveโฆ 11
## 12 stop_nb Preprocessโฆ accuracy 0.559 0.0120 25 recipe naiveโฆ 12
์ฃผ๋ชฉํด์ผ ํ ๋ช ๊ฐ์ง ํฅ๋ฏธ๋ก์ด ์ฌํญ์ ๋ค์๊ณผ ๊ฐ์ต๋๋ค.
์ต์ข ๋ชจ๋ธ ํ์ต ๋ฐ ํ๊ฐ
์ ์ฒด ์ ํ๋๋ฅผ ๋๊ฒ ์ ์งํ๊ณ ์ถ์ผ๋ฏ๋ก
rec_all
๋ฐ svm_spec
๋ฅผ ์ ํํฉ๋๋ค. last_fit()
๋ฅผ ์ฌ์ฉํ์ฌ ๋ชจ๋ ๊ต์ก ๋ฐ์ดํฐ์ ํ ๋ฒ ์ ํฉํ๊ณ ํ
์คํธ ๋ฐ์ดํฐ์์ ํ ๋ฒ ํ๊ฐํ ์ ์์ต๋๋ค.
comp_wf <- workflow(rec_all, svm_spec)
# last_fit(): fit the chosen workflow once on the full training set
# and evaluate once on the held-out test set.
comp_fitted <-
last_fit(
comp_wf,
comp_split,
metrics = metric_set(accuracy, sensitivity, specificity)
)
comp_fitted
## # Resampling results
## # Manual resampling
## # A tibble: 1 ร 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [329/83]> train/test split <tibble [โฆ <tibble โฆ <tibble [83 โฆ <workfloโฆ
์ด๋ป๊ฒ ๋์์ต๋๊น?
# Test-set metrics for the final fitted workflow.
collect_metrics(comp_fitted)
## # A tibble: 3 ร 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.735 Preprocessor1_Model1
## 2 sens binary 0.611 Preprocessor1_Model1
## 3 spec binary 0.830 Preprocessor1_Model1
์์ธก์ ๋ณผ ์๋ ์๊ณ ์๋ฅผ ๋ค์ด ํผ๋ ํ๋ ฌ์ ๋ง๋ค ์๋ ์์ต๋๋ค.
# Confusion matrix of test-set predictions (truth vs. predicted class).
comp_fitted |>
  collect_predictions() |>
  conf_mat(truth = char_type, estimate = .pred_class) |>
  autoplot()
๋ค๋ฅธ ๋ฐฉ๋ฒ๋ณด๋ค ์ปดํจํฐ์ ๋ํํ๋ ์ฌ๋์ ์๋ณํ๋ ๊ฒ์ด ๋ ์ฌ์ ์ต๋๋ค.
์ด๊ฒ์ ์ ํ ๋ชจ๋ธ์ด๊ธฐ ๋๋ฌธ์ ๊ฐ ๋ฐฉํฅ์์ ๊ฐ์ฅ ํฐ ํจ๊ณผ ํฌ๊ธฐ ํญ์ ๋ํด ๋ชจ๋ธ์ ๋จ์ด์ ๋ํ ๊ณ์๋ฅผ ๋ณผ ์๋ ์์ต๋๋ค.
# Because this is a linear model, we can tidy() the fitted workflow and
# look at the coefficients directly: the ten largest-magnitude terms in
# each sign direction.
# NOTE(review): the legend labels assume negative coefficients lean
# toward "Person" speech and positive toward "Computer" (labels are
# applied in FALSE/TRUE order) — confirm against the fitted model.
extract_workflow(comp_fitted) %>%
tidy() %>%
group_by(estimate > 0) %>%
slice_max(abs(estimate), n = 10) %>%
ungroup() %>%
mutate(term = str_remove(term, "tfidf_interaction_")) %>%
ggplot(aes(estimate, fct_reorder(term, estimate), fill = estimate > 0)) +
geom_col(alpha = 0.8) +
scale_fill_discrete(labels = c("people", "computer")) +
labs(y = NULL, fill = "More from...")
Reference
์ด ๋ฌธ์ ์ ๊ดํ์ฌ(Star Trek ๐์์ ์ธ๊ฐ/์ปดํจํฐ ์ํธ ์์ฉ ๋ชจ๋ธ๋ง), ์ฐ๋ฆฌ๋ ์ด๊ณณ์์ ๋ ๋ง์ ์๋ฃ๋ฅผ ๋ฐ๊ฒฌํ๊ณ ๋งํฌ๋ฅผ ํด๋ฆญํ์ฌ ๋ณด์๋ค: https://dev.to/juliasilge/modeling-human-computer-interactions-on-star-trek-4l6b ํ ์คํธ๋ฅผ ์์ ๋กญ๊ฒ ๊ณต์ ํ๊ฑฐ๋ ๋ณต์ฌํ ์ ์์ต๋๋ค. ํ์ง๋ง ์ด ๋ฌธ์์ URL์ ์ฐธ์กฐ URL๋ก ๋จ๊ฒจ ๋์ญ์์ค.
์ฐ์ํ ๊ฐ๋ฐ์ ์ฝํ ์ธ ๋ฐ๊ฒฌ์ ์ ๋ (Collection and Share based on the CC Protocol.)
์ข์ ์นํ์ด์ง ์ฆ๊ฒจ์ฐพ๊ธฐ
๊ฐ๋ฐ์ ์ฐ์ ์ฌ์ดํธ ์์ง
๊ฐ๋ฐ์๊ฐ ์์์ผ ํ ํ์ ์ฌ์ดํธ 100์ ์ถ์ฒ ์ฐ๋ฆฌ๋ ๋น์ ์ ์ํด 100๊ฐ์ ์์ฃผ ์ฌ์ฉํ๋ ๊ฐ๋ฐ์ ํ์ต ์ฌ์ดํธ๋ฅผ ์ ๋ฆฌํ์ต๋๋ค