1. correlation PCA¶

In [178]:
library(dplyr)
library(tidyverse)
library(tibble)

wine_data = read.csv('input/PCA_EXAMPLE/WINE.csv', row=1)
wine_data

pca_model = prcomp(wine_data, scale = TRUE)
A data.frame: 5 × 7
hedonicmeatdssertpricesugaralcoholacidity
<int><int><int><int><int><int><int>
wine_11478 77137
wine_21076 43147
wine_3 855105125
wine_4 247167113
wine_5 624133103
In [179]:
# eigenvalues are the variances of PCs
eigenvalues = (pca_model$sdev)^2
pc_names = colnames(pca_model$x)
df_scree = data.frame(pc_names, eigenvalues)
ggplot(data = df_scree, aes(x=pc_names, y=eigenvalues)) + geom_bar(stat="identity")
In [180]:
# loading matrix Q (squared elements of each PC sum to 1), this is different from the correlations between variable and PC
loading_Q = pca_model$rotation
loading_Q

# loading - the correlations between variable and PC
# PC_scores = pca_model$x      # same as: scale(wine_data) %*% loading_Q[,1:2]
# df_temp = cbind(PC_scores, wine_data)
# loading_correlation = cor(df_temp)[rownames(loading_Q), colnames(loading_Q)]
# loading_correlation
A matrix: 7 × 5 of type dbl
PC1PC2PC3PC4PC5
hedonic-0.39649537-0.11492093 0.802467762 0.05192087 1.628461e-01
meat-0.44544114 0.10904271-0.281059542-0.27447822 6.783554e-01
dssert-0.26456063 0.58542927-0.096071446 0.76029634-1.360023e-15
price 0.41597371 0.31111971 0.007335518-0.09388916 5.675693e-01
sugar-0.04852953 0.72445063 0.216110570-0.54740702-3.393711e-01
alcohol-0.43850226-0.05545248-0.465755989-0.16874049-2.753171e-01
acidity-0.45465130-0.08646543 0.064304541-0.08350114 1.441867e-02
In [181]:
colSums(loading_Q ^ 2)
rowSums(loading_Q ^ 2)
PC1
1
PC2
1
PC3
1
PC4
1
PC5
1
hedonic
0.84358454061835
meat
0.824806941609728
dssert
1
price
0.600833542942357
sugar
0.98871478495513
alcohol
0.516560701783608
acidity
0.225499488090828
In [182]:
# before rotation
library(ggforce)
corr_circle = as.data.frame(loading_Q) %>% tibble::rownames_to_column('features') %>% select(features, PC1, PC2)
# corr_circle
p1 = ggplot(data = corr_circle, aes(x = PC1, y=PC2, label=features)) + 
        geom_point() + geom_text(hjust=0, vjust=0) + xlim(-1, 1) + ylim(-1,1) +
        geom_circle(aes(x0 =0, y0 = 0, r = 1)) 
p1

# after rotation
varimax_Q <- varimax(loading_Q[,1:2])
loading_Q_varimax = loading_Q[,1:2] %*% varimax_Q$rotmat
colnames(loading_Q_varimax) = c('PC1', 'PC2')
loading_Q_varimax = as.data.frame(loading_Q_varimax) %>% tibble::rownames_to_column('features')

p2 = ggplot(data = loading_Q_varimax, aes(x = PC1, y=PC2, label=features)) + 
        geom_point() + geom_text(hjust=0, vjust=0) + xlim(-1, 1) + ylim(-1,1) +
        geom_circle(aes(x0 =0, y0 = 0, r = 1)) 
p2
In [183]:
print('before rotation')
loading_Q[,1:2]
print('after rotation')
loading_Q_varimax
[1] "before rotation"
A matrix: 7 × 2 of type dbl
PC1PC2
hedonic-0.39649537-0.11492093
meat-0.44544114 0.10904271
dssert-0.26456063 0.58542927
price 0.41597371 0.31111971
sugar-0.04852953 0.72445063
alcohol-0.43850226-0.05545248
acidity-0.45465130-0.08646543
[1] "after rotation"
A data.frame: 7 × 3
featuresPC1PC2
<chr><dbl><dbl>
hedonic-0.4122397-0.02176773
meat -0.4089853 0.20745881
dssert -0.1245344 0.63024672
price 0.4758140 0.20840020
sugar 0.1174461 0.71651255
alcohol-0.4396263 0.04569372
acidity-0.4624033 0.01916442
In [175]:
# # after rotation
# varimax_Q <- varimax(loading_Q[,1:2])
# loading_Q_varimax = loading_Q[,1:2] %*% varimax_Q$rotmat

# colnames(loading_Q_varimax) = c('PC1', 'PC2')
# new_PC_score = scale(wine_data) %*% loading_Q_varimax
# df_temp = cbind(new_PC_score, wine_data)
# new_loading_correlation = cor(df_temp)[rownames(loading_Q_varimax), colnames(loading_Q_varimax)]
# new_loading_correlation

# library(ggforce)
# corr_circle = as.data.frame(new_loading_correlation) %>% tibble::rownames_to_column('features') %>% select(features, PC1, PC2)
# p2 = ggplot(data = corr_circle, aes(x = PC1, y=PC2, label=features)) + 
#         geom_point() + geom_text(hjust=0, vjust=0) + xlim(-1, 1) + ylim(-1,1) +
#         geom_circle(aes(x0 =0, y0 = 0, r = 1)) 
# p2

2. covariance PCA¶

In [184]:
food_data = read.csv('input/PCA_EXAMPLE/Francs.csv', row=1)
food_data
pca_model = prcomp(food_data, center = TRUE, scale. = FALSE)
A data.frame: 12 × 7
breadvegetablesfruitmeatpoultrymilkwine
<int><int><int><int><int><int><int>
BC2332 4283541437 526247427
WC2293 5593881527 567239258
UC2372 7675621948 927235433
BC3406 5633411507 544324407
WC3386 6083961501 558319363
UC3438 84368923451148243341
BC4534 6603671620 638414407
WC4460 6994841856 762400416
UC4385 78962123661149304282
BC5655 7764231848 759495486
WC5584 9955482056 893518319
UC5515109788726301167561284
In [185]:
# eigenvalues are the variances of PCs
eigenvalues = (pca_model$sdev)^2
variance_explained = eigenvalues/ sum(eigenvalues)
pc_names = colnames(pca_model$x)
df_scree = data.frame(pc_names, variance_explained)
ggplot(data = df_scree, aes(x=pc_names, y=variance_explained)) + geom_bar(stat="identity")
In [186]:
# loading matrix Q (squared elements of each PC sum to 1), this is different from the correlations between variable and PC
loading_Q = pca_model$rotation
# loading_Q

# loading - the correlations between variable and PC
PC_scores = pca_model$x      # same as: scale(wine_data) %*% loading_Q[,1:2]
df_temp = cbind(PC_scores, food_data)
loading_correlation = cor(df_temp)[rownames(loading_Q), colnames(loading_Q)]
# loading_correlation
In [188]:
rowSums(loading_Q ^ 2)
colSums(loading_Q ^ 2)
# rowSums(loading_correlation ^ 2)
bread
1
vegetables
1
fruit
1
meat
0.999999999999999
poultry
1
milk
1
wine
1
PC1
0.999999999999999
PC2
1
PC3
1
PC4
1
PC5
1
PC6
1
PC7
0.999999999999999
In [189]:
df_cases = as.data.frame(PC_scores[,1:2]) %>% 
            tibble::rownames_to_column('individuals') %>% 
            mutate(class = str_sub(individuals,1,2))
df_cases$class = factor(df_cases$class)
p1 = ggplot(data = df_cases, aes(x = PC1, y=PC2, label=individuals, color = class)) + 
        geom_point() + geom_text(hjust=0, vjust=0) + xlim(-1000, 1000) + ylim(-1000,1000)

p1
In [192]:
# # before rotation
# library(ggforce)
# corr_circle = as.data.frame(loading_Q) %>% tibble::rownames_to_column('features') %>% select(features, PC1, PC2)
# corr_circle
# p1 = ggplot(data = corr_circle, aes(x = PC1, y=PC2, label=features)) + 
#         geom_point() + geom_text(nudge_x =-0.1,nudge_y =0.1) + xlim(-1, 1) + ylim(-1,1) +
#         geom_circle(aes(x0 =0, y0 = 0, r = 1)) 
# p1
In [312]:
# df_biplot = rbind(as.data.frame(loading_correlation)[1:2], as.data.frame(PC_scores[,1:2]))
# df_biplot$label = rep(c('variables','cases'), times=c(7,12))
# df_biplot$label = factor(df_biplot$label)
# df_biplot = df_biplot %>% tibble::rownames_to_column('names')

p_bi = biplot(pca_model)
p_bi
NULL

3. correspondence analysis¶

In [193]:
# different diagnostic label
who_gambled_2016 = read.csv('output/who_gambled_2016.csv', row =1)
who_gambled_2016 = who_gambled_2016$subjectkey 
print(paste('who_gambled_2016:',length(who_gambled_2016)))

label = read.csv('output/year2016_gambler_PPGM_Label.csv', row =1) %>% select(subjectkey, diagnosis_ppgm)

data_2016 = read.csv('output/HarmSurvey2016.csv', row =1)

dim(data_2016)

gambler_data_2016 = data_2016 %>% filter(subjectkey %in% who_gambled_2016)
harm_data_temp = gambler_data_2016 %>% dplyr::select(subjectkey,starts_with('h')) 
harm_data_temp = merge(harm_data_temp, label, by = 'subjectkey')

harm_data = harm_data_temp %>% tibble::column_to_rownames(var = 'subjectkey')
dim(harm_data)
head(harm_data,2)
[1] "who_gambled_2016: 5461"
  1. 7186
  2. 204
  1. 3068
  2. 73
A data.frame: 2 × 73
hf_reduction_Available_moneyhf_reduction_Savinghf_reduction_recreation_expenseshf_Increased_credit_card_debthf_Sold_personal_itemshf_reduction_essential_expenseshf_reduction_beneficial_expenseshf_late_bill_paymentshf_additional_employmenthf_need_welfare_organizations⋯ho_violenceho_Petty_theft_dishonestyho_children_neglectedho_less_connected_communityho_outcast_communityho_Reduced_contribution_communityho_crimeho_fake_promise_paybackho_steal_ownhomediagnosis_ppgm
<int><int><int><int><int><int><int><int><int><int>⋯<int><int><int><int><int><int><int><int><int><chr>
10000000000⋯000000000recreational_gambler
20000000000⋯000000000recreational_gambler
In [230]:
df=harm_data%>% select(starts_with('hf'),diagnosis_ppgm)
table = sapply(df[ ,1:ncol(df)-1], function(x) tapply(x, df[ ,ncol(df)], sum))
table  
table_centered = scale( table, center = TRUE, scale = FALSE)           
# table_normalized = table
# for (i in 1:ncol(table)){
#     table_normalized[,i] = table[,i] / colSums(table)[[i]] 
#     table_normalized = scale( table_normalized, center = TRUE, scale = FALSE)
#     }
# table_normalized
res.ca <- CA(table, graph = TRUE)
A matrix: 4 × 14 of type int
hf_reduction_Available_moneyhf_reduction_Savinghf_reduction_recreation_expenseshf_Increased_credit_card_debthf_Sold_personal_itemshf_reduction_essential_expenseshf_reduction_beneficial_expenseshf_late_bill_paymentshf_additional_employmenthf_need_welfare_organizationshf_need_temporary_accommodationhf_Loss_assetshf_Loss_utility_supplyhf_Bankruptcy
atRisk_gambler89193221 6 7 5411100
pathological_gambler36252594171422952239
problem_gambler 9 0 700 2 0 0000000
recreational_gambler87152632 3 6 4442232
In [231]:
# book_data = read.csv('input/PCA_EXAMPLE/punctuation.csv', row=1)
# book_data

# library("FactoMineR")
# library("factoextra")

# # Graph of contingency tables
# library("gplots")
# # 1. convert the data as a table
# dt <- as.table(as.matrix(book_data))
# # 2. Graph
# balloonplot(t(dt), main ="book_data", xlab ="", ylab="",label = FALSE, show.margins = FALSE)


# # the projection of rows and columns
# res.ca <- CA(book_data, graph = TRUE)
# # print(res.ca)

4. multiple factor analysis¶

In [232]:
wineRate_data_RAW = read.csv('input/PCA_EXAMPLE/mfa_wine.csv', row=1) %>% select(-oak_type)
wineRate_data_RAW
A data.frame: 6 × 10
e1_fruitye1_woodye1_coffeee2_redfruite2_roastede2_vanilline2_woodye3_fruitye3_buttere3_woody
<int><int><int><int><int><int><int><int><int><int>
wine_11672576367
wine_25324442443
wine_36115211711
wine_47127212222
wine_52543565266
wine_63443545175
In [257]:
wineRate_data = wineRate_data_RAW
pca_model_1 = prcomp(wineRate_data[1:3], center = TRUE, scale. = TRUE)
SV_1 = pca_model_1$sdev
pca_model_2 = prcomp(wineRate_data[4:7], center = TRUE, scale. = TRUE)
SV_2 = pca_model_2$sdev
pca_model_3 = prcomp(wineRate_data[8:10], center = TRUE, scale. = TRUE)
SV_3 = pca_model_3$sdev
SV_1[1]
SV_2[1]
SV_3[1]

# standardization + normalization
wineRate_data[1:3] = scale(wineRate_data[1:3])/SV_1[1]
wineRate_data[4:7] = scale(wineRate_data[4:7])/SV_2[1]
wineRate_data[8:10] = scale(wineRate_data[8:10])/SV_3[1]
wineRate_data
1.6919205250977
1.91078097498053
1.57495677842037
A data.frame: 6 × 10
e1_fruitye1_woodye1_coffeee2_redfruite2_roastede2_vanilline2_woodye3_fruitye3_buttere3_woody
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
wine_1-0.7492854 0.76303492 1.0032013-0.5851189 0.41480103 0.6673693 0.6309499-0.04951998 0.43690237 0.8049309
wine_2 0.2497618-0.09537937-0.3648005 0.0000000 0.05925729 0.0351247-0.3785699 0.24759990-0.08738047-0.2683103
wine_3 0.4995236-0.66765556-0.6384008 0.2925594-0.65183020-0.5971199-0.6309499 1.13895952-0.87380475-0.8049309
wine_4 0.7492854-0.66765556-0.3648005 0.8776783-0.65183020-0.5971199-0.3785699-0.34663985-0.61166332-0.5366206
wine_5-0.4995236 0.47689683 0.1824002-0.2925594 0.41480103 0.4566211 0.3785699-0.34663985 0.43690237 0.5366206
wine_6-0.2497618 0.19075873 0.1824002-0.2925594 0.41480103 0.0351247 0.3785699-0.64375973 0.69904380 0.2683103
In [259]:
wineRate_data$oak_type = c('oak_1','oak_2','oak_2','oak_2','oak_1','oak_1')
wineRate_data$oak_type = factor(wineRate_data$oak_type)

library(FactoMineR)
res.mfa <- MFA(wineRate_data, 
               group = c( 3, 4, 3,1), 
               type = c("s", "s", "s", "n"),
               name.group = c("expert1", "expert2", "expert3",'oak_type'),
               num.group.sup = 4,
               graph = F)
In [260]:
get_eigenvalue(res.mfa)

fviz_screeplot(res.mfa)
A matrix: 5 × 3 of type dbl
eigenvaluevariance.percentcumulative.variance.percent
Dim.12.8348006784.5450973 84.54510
Dim.20.3568590510.6429645 95.18806
Dim.30.11536147 3.4405407 98.62860
Dim.40.03328555 0.9927083 99.62131
Dim.50.01269746 0.3786892100.00000
In [261]:
fviz_mfa_ind(res.mfa, col.ind = "cos2", 
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE)
In [263]:
fviz_mfa_ind(res.mfa, 
             habillage = "oak_type", # color by groups 
             palette = c("#00AFBB", "#E7B800", "#FC4E07"),
             addEllipses = TRUE, ellipse.type = "confidence", 
             repel = TRUE # Avoid text overlapping
             )
In [262]:
fviz_mfa_var(res.mfa, "group")
In [264]:
# fviz_mfa_var(res.mfa, "quanti.var", palette = "jco", 
#              col.var.sup = "violet", repel = TRUE)

fviz_mfa_var(res.mfa, "quanti.var", palette = "jco", 
             col.var.sup = "violet", repel = TRUE,
             geom = c("point", "text"), legend = "top")
In [ ]: