library(dplyr)
library(tidyverse)
library(tibble)
wine_data = read.csv('input/PCA_EXAMPLE/WINE.csv', row=1)
wine_data
pca_model = prcomp(wine_data, scale = TRUE)
hedonic | meat | dssert | price | sugar | alcohol | acidity | |
---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <int> | <int> | |
wine_1 | 14 | 7 | 8 | 7 | 7 | 13 | 7 |
wine_2 | 10 | 7 | 6 | 4 | 3 | 14 | 7 |
wine_3 | 8 | 5 | 5 | 10 | 5 | 12 | 5 |
wine_4 | 2 | 4 | 7 | 16 | 7 | 11 | 3 |
wine_5 | 6 | 2 | 4 | 13 | 3 | 10 | 3 |
# eigenvalues are the variances of PCs
eigenvalues = (pca_model$sdev)^2
pc_names = colnames(pca_model$x)
df_scree = data.frame(pc_names, eigenvalues)
ggplot(data = df_scree, aes(x=pc_names, y=eigenvalues)) + geom_bar(stat="identity")
# loading matrix Q (squared elements of each PC sum to 1), this is different from the correlations between variable and PC
loading_Q = pca_model$rotation
loading_Q
# loading - the correlations between variable and PC
# PC_scores = pca_model$x # same as: scale(wine_data) %*% loading_Q[,1:2]
# df_temp = cbind(PC_scores, wine_data)
# loading_correlation = cor(df_temp)[rownames(loading_Q), colnames(loading_Q)]
# loading_correlation
PC1 | PC2 | PC3 | PC4 | PC5 | |
---|---|---|---|---|---|
hedonic | -0.39649537 | -0.11492093 | 0.802467762 | 0.05192087 | 1.628461e-01 |
meat | -0.44544114 | 0.10904271 | -0.281059542 | -0.27447822 | 6.783554e-01 |
dssert | -0.26456063 | 0.58542927 | -0.096071446 | 0.76029634 | -1.360023e-15 |
price | 0.41597371 | 0.31111971 | 0.007335518 | -0.09388916 | 5.675693e-01 |
sugar | -0.04852953 | 0.72445063 | 0.216110570 | -0.54740702 | -3.393711e-01 |
alcohol | -0.43850226 | -0.05545248 | -0.465755989 | -0.16874049 | -2.753171e-01 |
acidity | -0.45465130 | -0.08646543 | 0.064304541 | -0.08350114 | 1.441867e-02 |
colSums(loading_Q ^ 2)
rowSums(loading_Q ^ 2)
# before rotation
library(ggforce)
corr_circle = as.data.frame(loading_Q) %>% tibble::rownames_to_column('features') %>% select(features, PC1, PC2)
# corr_circle
p1 = ggplot(data = corr_circle, aes(x = PC1, y=PC2, label=features)) +
geom_point() + geom_text(hjust=0, vjust=0) + xlim(-1, 1) + ylim(-1,1) +
geom_circle(aes(x0 =0, y0 = 0, r = 1))
p1
# after rotation
varimax_Q <- varimax(loading_Q[,1:2])
loading_Q_varimax = loading_Q[,1:2] %*% varimax_Q$rotmat
colnames(loading_Q_varimax) = c('PC1', 'PC2')
loading_Q_varimax = as.data.frame(loading_Q_varimax) %>% tibble::rownames_to_column('features')
p2 = ggplot(data = loading_Q_varimax, aes(x = PC1, y=PC2, label=features)) +
geom_point() + geom_text(hjust=0, vjust=0) + xlim(-1, 1) + ylim(-1,1) +
geom_circle(aes(x0 =0, y0 = 0, r = 1))
p2
print('before rotation')
loading_Q[,1:2]
print('after rotation')
loading_Q_varimax
[1] "before rotation"
PC1 | PC2 | |
---|---|---|
hedonic | -0.39649537 | -0.11492093 |
meat | -0.44544114 | 0.10904271 |
dssert | -0.26456063 | 0.58542927 |
price | 0.41597371 | 0.31111971 |
sugar | -0.04852953 | 0.72445063 |
alcohol | -0.43850226 | -0.05545248 |
acidity | -0.45465130 | -0.08646543 |
[1] "after rotation"
features | PC1 | PC2 |
---|---|---|
<chr> | <dbl> | <dbl> |
hedonic | -0.4122397 | -0.02176773 |
meat | -0.4089853 | 0.20745881 |
dssert | -0.1245344 | 0.63024672 |
price | 0.4758140 | 0.20840020 |
sugar | 0.1174461 | 0.71651255 |
alcohol | -0.4396263 | 0.04569372 |
acidity | -0.4624033 | 0.01916442 |
# # after rotation
# varimax_Q <- varimax(loading_Q[,1:2])
# loading_Q_varimax = loading_Q[,1:2] %*% varimax_Q$rotmat
# colnames(loading_Q_varimax) = c('PC1', 'PC2')
# new_PC_score = scale(wine_data) %*% loading_Q_varimax
# df_temp = cbind(new_PC_score, wine_data)
# new_loading_correlation = cor(df_temp)[rownames(loading_Q_varimax), colnames(loading_Q_varimax)]
# new_loading_correlation
# library(ggforce)
# corr_circle = as.data.frame(new_loading_correlation) %>% tibble::rownames_to_column('features') %>% select(features, PC1, PC2)
# p2 = ggplot(data = corr_circle, aes(x = PC1, y=PC2, label=features)) +
# geom_point() + geom_text(hjust=0, vjust=0) + xlim(-1, 1) + ylim(-1,1) +
# geom_circle(aes(x0 =0, y0 = 0, r = 1))
# p2
food_data = read.csv('input/PCA_EXAMPLE/Francs.csv', row=1)
food_data
pca_model = prcomp(food_data, center = TRUE, scale. = FALSE)
bread | vegetables | fruit | meat | poultry | milk | wine | |
---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <int> | <int> | |
BC2 | 332 | 428 | 354 | 1437 | 526 | 247 | 427 |
WC2 | 293 | 559 | 388 | 1527 | 567 | 239 | 258 |
UC2 | 372 | 767 | 562 | 1948 | 927 | 235 | 433 |
BC3 | 406 | 563 | 341 | 1507 | 544 | 324 | 407 |
WC3 | 386 | 608 | 396 | 1501 | 558 | 319 | 363 |
UC3 | 438 | 843 | 689 | 2345 | 1148 | 243 | 341 |
BC4 | 534 | 660 | 367 | 1620 | 638 | 414 | 407 |
WC4 | 460 | 699 | 484 | 1856 | 762 | 400 | 416 |
UC4 | 385 | 789 | 621 | 2366 | 1149 | 304 | 282 |
BC5 | 655 | 776 | 423 | 1848 | 759 | 495 | 486 |
WC5 | 584 | 995 | 548 | 2056 | 893 | 518 | 319 |
UC5 | 515 | 1097 | 887 | 2630 | 1167 | 561 | 284 |
# eigenvalues are the variances of PCs
eigenvalues = (pca_model$sdev)^2
variance_explained = eigenvalues/ sum(eigenvalues)
pc_names = colnames(pca_model$x)
df_scree = data.frame(pc_names, variance_explained)
ggplot(data = df_scree, aes(x=pc_names, y=variance_explained)) + geom_bar(stat="identity")
# loading matrix Q (squared elements of each PC sum to 1), this is different from the correlations between variable and PC
loading_Q = pca_model$rotation
# loading_Q
# loading - the correlations between variable and PC
PC_scores = pca_model$x # same as: scale(wine_data) %*% loading_Q[,1:2]
df_temp = cbind(PC_scores, food_data)
loading_correlation = cor(df_temp)[rownames(loading_Q), colnames(loading_Q)]
# loading_correlation
rowSums(loading_Q ^ 2)
colSums(loading_Q ^ 2)
# rowSums(loading_correlation ^ 2)
df_cases = as.data.frame(PC_scores[,1:2]) %>%
tibble::rownames_to_column('individuals') %>%
mutate(class = str_sub(individuals,1,2))
df_cases$class = factor(df_cases$class)
p1 = ggplot(data = df_cases, aes(x = PC1, y=PC2, label=individuals, color = class)) +
geom_point() + geom_text(hjust=0, vjust=0) + xlim(-1000, 1000) + ylim(-1000,1000)
p1
# # before rotation
# library(ggforce)
# corr_circle = as.data.frame(loading_Q) %>% tibble::rownames_to_column('features') %>% select(features, PC1, PC2)
# corr_circle
# p1 = ggplot(data = corr_circle, aes(x = PC1, y=PC2, label=features)) +
# geom_point() + geom_text(nudge_x =-0.1,nudge_y =0.1) + xlim(-1, 1) + ylim(-1,1) +
# geom_circle(aes(x0 =0, y0 = 0, r = 1))
# p1
# df_biplot = rbind(as.data.frame(loading_correlation)[1:2], as.data.frame(PC_scores[,1:2]))
# df_biplot$label = rep(c('variables','cases'), times=c(7,12))
# df_biplot$label = factor(df_biplot$label)
# df_biplot = df_biplot %>% tibble::rownames_to_column('names')
p_bi = biplot(pca_model)
p_bi
NULL
# different diagnostic label
who_gambled_2016 = read.csv('output/who_gambled_2016.csv', row =1)
who_gambled_2016 = who_gambled_2016$subjectkey
print(paste('who_gambled_2016:',length(who_gambled_2016)))
label = read.csv('output/year2016_gambler_PPGM_Label.csv', row =1) %>% select(subjectkey, diagnosis_ppgm)
data_2016 = read.csv('output/HarmSurvey2016.csv', row =1)
dim(data_2016)
gambler_data_2016 = data_2016 %>% filter(subjectkey %in% who_gambled_2016)
harm_data_temp = gambler_data_2016 %>% dplyr::select(subjectkey,starts_with('h'))
harm_data_temp = merge(harm_data_temp, label, by = 'subjectkey')
harm_data = harm_data_temp %>% tibble::column_to_rownames(var = 'subjectkey')
dim(harm_data)
head(harm_data,2)
[1] "who_gambled_2016: 5461"
hf_reduction_Available_money | hf_reduction_Saving | hf_reduction_recreation_expenses | hf_Increased_credit_card_debt | hf_Sold_personal_items | hf_reduction_essential_expenses | hf_reduction_beneficial_expenses | hf_late_bill_payments | hf_additional_employment | hf_need_welfare_organizations | ⋯ | ho_violence | ho_Petty_theft_dishonesty | ho_children_neglected | ho_less_connected_community | ho_outcast_community | ho_Reduced_contribution_community | ho_crime | ho_fake_promise_payback | ho_steal_ownhome | diagnosis_ppgm | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | ⋯ | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <chr> | |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | recreational_gambler |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | recreational_gambler |
df=harm_data%>% select(starts_with('hf'),diagnosis_ppgm)
table = sapply(df[ ,1:ncol(df)-1], function(x) tapply(x, df[ ,ncol(df)], sum))
table
table_centered = scale( table, center = TRUE, scale = FALSE)
# table_normalized = table
# for (i in 1:ncol(table)){
# table_normalized[,i] = table[,i] / colSums(table)[[i]]
# table_normalized = scale( table_normalized, center = TRUE, scale = FALSE)
# }
# table_normalized
res.ca <- CA(table, graph = TRUE)
hf_reduction_Available_money | hf_reduction_Saving | hf_reduction_recreation_expenses | hf_Increased_credit_card_debt | hf_Sold_personal_items | hf_reduction_essential_expenses | hf_reduction_beneficial_expenses | hf_late_bill_payments | hf_additional_employment | hf_need_welfare_organizations | hf_need_temporary_accommodation | hf_Loss_assets | hf_Loss_utility_supply | hf_Bankruptcy | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
atRisk_gambler | 89 | 19 | 32 | 2 | 1 | 6 | 7 | 5 | 4 | 1 | 1 | 1 | 0 | 0 |
pathological_gambler | 36 | 25 | 25 | 9 | 4 | 17 | 14 | 22 | 9 | 5 | 2 | 2 | 3 | 9 |
problem_gambler | 9 | 0 | 7 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
recreational_gambler | 87 | 15 | 26 | 3 | 2 | 3 | 6 | 4 | 4 | 4 | 2 | 2 | 3 | 2 |
# book_data = read.csv('input/PCA_EXAMPLE/punctuation.csv', row=1)
# book_data
# library("FactoMineR")
# library("factoextra")
# # Graph of contingency tables
# library("gplots")
# # 1. convert the data as a table
# dt <- as.table(as.matrix(book_data))
# # 2. Graph
# balloonplot(t(dt), main ="book_data", xlab ="", ylab="",label = FALSE, show.margins = FALSE)
# # the projection of rows and columns
# res.ca <- CA(book_data, graph = TRUE)
# # print(res.ca)
wineRate_data_RAW = read.csv('input/PCA_EXAMPLE/mfa_wine.csv', row=1) %>% select(-oak_type)
wineRate_data_RAW
e1_fruity | e1_woody | e1_coffee | e2_redfruit | e2_roasted | e2_vanillin | e2_woody | e3_fruity | e3_butter | e3_woody | |
---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | |
wine_1 | 1 | 6 | 7 | 2 | 5 | 7 | 6 | 3 | 6 | 7 |
wine_2 | 5 | 3 | 2 | 4 | 4 | 4 | 2 | 4 | 4 | 3 |
wine_3 | 6 | 1 | 1 | 5 | 2 | 1 | 1 | 7 | 1 | 1 |
wine_4 | 7 | 1 | 2 | 7 | 2 | 1 | 2 | 2 | 2 | 2 |
wine_5 | 2 | 5 | 4 | 3 | 5 | 6 | 5 | 2 | 6 | 6 |
wine_6 | 3 | 4 | 4 | 3 | 5 | 4 | 5 | 1 | 7 | 5 |
wineRate_data = wineRate_data_RAW
pca_model_1 = prcomp(wineRate_data[1:3], center = TRUE, scale. = TRUE)
SV_1 = pca_model_1$sdev
pca_model_2 = prcomp(wineRate_data[4:7], center = TRUE, scale. = TRUE)
SV_2 = pca_model_2$sdev
pca_model_3 = prcomp(wineRate_data[8:10], center = TRUE, scale. = TRUE)
SV_3 = pca_model_3$sdev
SV_1[1]
SV_2[1]
SV_3[1]
# standardization + normalization
wineRate_data[1:3] = scale(wineRate_data[1:3])/SV_1[1]
wineRate_data[4:7] = scale(wineRate_data[4:7])/SV_2[1]
wineRate_data[8:10] = scale(wineRate_data[8:10])/SV_3[1]
wineRate_data
e1_fruity | e1_woody | e1_coffee | e2_redfruit | e2_roasted | e2_vanillin | e2_woody | e3_fruity | e3_butter | e3_woody | |
---|---|---|---|---|---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
wine_1 | -0.7492854 | 0.76303492 | 1.0032013 | -0.5851189 | 0.41480103 | 0.6673693 | 0.6309499 | -0.04951998 | 0.43690237 | 0.8049309 |
wine_2 | 0.2497618 | -0.09537937 | -0.3648005 | 0.0000000 | 0.05925729 | 0.0351247 | -0.3785699 | 0.24759990 | -0.08738047 | -0.2683103 |
wine_3 | 0.4995236 | -0.66765556 | -0.6384008 | 0.2925594 | -0.65183020 | -0.5971199 | -0.6309499 | 1.13895952 | -0.87380475 | -0.8049309 |
wine_4 | 0.7492854 | -0.66765556 | -0.3648005 | 0.8776783 | -0.65183020 | -0.5971199 | -0.3785699 | -0.34663985 | -0.61166332 | -0.5366206 |
wine_5 | -0.4995236 | 0.47689683 | 0.1824002 | -0.2925594 | 0.41480103 | 0.4566211 | 0.3785699 | -0.34663985 | 0.43690237 | 0.5366206 |
wine_6 | -0.2497618 | 0.19075873 | 0.1824002 | -0.2925594 | 0.41480103 | 0.0351247 | 0.3785699 | -0.64375973 | 0.69904380 | 0.2683103 |
wineRate_data$oak_type = c('oak_1','oak_2','oak_2','oak_2','oak_1','oak_1')
wineRate_data$oak_type = factor(wineRate_data$oak_type)
library(FactoMineR)
res.mfa <- MFA(wineRate_data,
group = c( 3, 4, 3,1),
type = c("s", "s", "s", "n"),
name.group = c("expert1", "expert2", "expert3",'oak_type'),
num.group.sup = 4,
graph = F)
get_eigenvalue(res.mfa)
fviz_screeplot(res.mfa)
eigenvalue | variance.percent | cumulative.variance.percent | |
---|---|---|---|
Dim.1 | 2.83480067 | 84.5450973 | 84.54510 |
Dim.2 | 0.35685905 | 10.6429645 | 95.18806 |
Dim.3 | 0.11536147 | 3.4405407 | 98.62860 |
Dim.4 | 0.03328555 | 0.9927083 | 99.62131 |
Dim.5 | 0.01269746 | 0.3786892 | 100.00000 |
fviz_mfa_ind(res.mfa, col.ind = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)
fviz_mfa_ind(res.mfa,
habillage = "oak_type", # color by groups
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
addEllipses = TRUE, ellipse.type = "confidence",
repel = TRUE # Avoid text overlapping
)
fviz_mfa_var(res.mfa, "group")
# fviz_mfa_var(res.mfa, "quanti.var", palette = "jco",
# col.var.sup = "violet", repel = TRUE)
fviz_mfa_var(res.mfa, "quanti.var", palette = "jco",
col.var.sup = "violet", repel = TRUE,
geom = c("point", "text"), legend = "top")