update basic scripts

97bee072 · Liu, Meiqi · f58a89e9 · 97bee072 · 97bee072 · f58a89e9
Commit 97bee072 authored 11 months ago by Liu, Meiqi
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ data/*.laccdb
 data/Query_Results.csv
 data/Table*.csv
 docs/*.csv
+graphs/*.png
--- a/docs/1_data_overview.R
+++ b/docs/1_data_overview.R
@@ -2,7 +2,7 @@ library(ProjectTemplate)
 load.project()

 # original data structure
-str(data_all)
+glimpse(data_all)

 # amount of data for each type
 str_type <- data_all %>% 

--- a/graphs/PFAS_sum.png
+++ b/graphs/PFAS_sum.png
--- a/graphs/README.md
+++ b/graphs/README.md
-Here you can store any graphs that you produce.
--- a/graphs/heatmap_detections.png
+++ b/graphs/heatmap_detections.png
--- a/graphs/sum_pfas.png
+++ b/graphs/sum_pfas.png
--- a/graphs/sum_pfas_v2.png
+++ b/graphs/sum_pfas_v2.png
--- a/graphs/sum_pfas_v3.png
+++ b/graphs/sum_pfas_v3.png
--- a/munge/2_read_metadata.R
+++ b/munge/2_read_metadata.R
@@ -61,12 +61,13 @@ metadata_pfas <- metadata_pfas %>%
 # extract level
 level_matrix <- c("Atmospheric Deposition", "Riverwater", "Groundwater",
                  "Leachate", "Surface Runoff", "Wastewater")
-level_type <- c("Atmospheric Deposition", "Danube Tributary", "Tributary Hotspot",
+level_type <- c("Atmospheric Deposition", "Danube Tributary", "Alz",
                "Danube Main", "Danube Bank-filtered Water", "Groundwater",
                "Groundwater directly influenced by Landfill", "Landfill leachate",
                "Surface Runoff", "Municipal Influent", "Municipal Effluent",
                "Industrial Influent", "Industrial Effluent")

 metadata_compartment <- metadata_compartment %>%
+  mutate(type = ifelse(type == "Tributary Hotspot", "Alz", type)) %>%
  mutate(matrix = factor(matrix, level = level_matrix),
         type = factor(type, level = level_type))
--- a/munge/3_data_preprocess.R
+++ b/munge/3_data_preprocess.R
@@ -7,6 +7,7 @@ data_all <- data_all %>%

 # ---------- factorise columns ----------
 data_all <- data_all %>%
+  mutate(type = ifelse(type == "Tributary Hotspot", "Alz", type)) %>%
  mutate(matrix = factor(matrix, level = level_matrix),
         type = factor(type, level = level_type),
         cas_number = factor(cas_number, level = level_casno),

--- a/src/1_detection_rates.R
+++ b/src/1_detection_rates.R
 ## statistical analysis on pfas detection
-
 library(ProjectTemplate)
 load.project()

 # ------------ detection across whole set ------
 ## Rule: across the whole dataset, detection rate
 ##  whole dataset 
-detection_wholeset <- mean(!data_all$value_below_loq) * 100
+#detection_wholeset <- 
+mean(!data_all$value_below_loq) * 100

 ## promisces data
-detection_promisces <- data_all %>%
+#detection_promisces <- 
+data_all %>%
  filter(datasource == "EU-PROMISCES") %>%
  summarise(detection_rate = mean(!value_below_loq) * 100) %>%
  pull(detection_rate)
@@ -17,14 +18,16 @@ detection_promisces <- data_all %>%
 # ---------- detection among samples -----
 ## Rule: for each unique sample, at least one PFAS is detected
 ## whole dataset
-detection_sample_wholeset <- data_all %>%
+#detection_sample_wholeset <- 
+data_all %>%
  group_by(sample) %>%
  summarise(at_least_one_detect = any(value_below_loq == FALSE)) %>%
  summarise(detection_rate = mean(at_least_one_detect) * 100) %>%
  pull(detection_rate)

 ## promisces data
-detection_sample_promisces <- data_all %>%
+#detection_sample_promisces <- 
+data_all %>%
  filter(datasource == "EU-PROMISCES") %>%
  group_by(sample) %>%
  summarise(at_least_one_detect = any(value_below_loq == FALSE)) %>%
@@ -54,56 +57,62 @@ detection_pfas_promisces <- data_all %>%

 # ------------- detection among pfas group -----
 ## Rule: across the whole dataset, detection rate for substance group
-detection_pfasgroup_wholeset <- data_all %>%
+#detection_pfasgroup_wholeset <- 
+data_all %>%
  left_join(metadata_pfas, by = "substance") %>%
  group_by(pfas_group_abb) %>%
  summarise(n_all = n(),
            detection_rate_all = mean(!value_below_loq) * 100) 

-detection_pfasgroup_promisces <- data_all %>%
+#detection_pfasgroup_promisces <- 
+data_all %>%
  filter(datasource == "EU-PROMISCES") %>%
  left_join(metadata_pfas, by = "substance") %>%
  group_by(pfas_group_abb) %>%
  summarise(n_promisces = n(),
            detection_rate_promisces = mean(!value_below_loq) * 100)

-detection_pfasgroup_combine <- detection_pfasgroup_wholeset %>%
-  left_join(detection_pfasgroup_promisces, by = "pfas_group_abb") %>%
-  select(1, 2, 4, 3, 5) %>%
-  arrange(desc(detection_rate_all))
+# detection_pfasgroup_combine <- detection_pfasgroup_wholeset %>%
+#   left_join(detection_pfasgroup_promisces, by = "pfas_group_abb") %>%
+#   select(1, 2, 4, 3, 5) %>%
+#   arrange(desc(detection_rate_all))

 # write.csv(detection_pfasgroup_combine, "docs/detection_pfas_groups.csv")

 # --------- detection among matrix ------
-detection_matrix_wholeset <- data_all %>%
+#detection_matrix_wholeset <- 
+data_all %>%
  group_by(matrix) %>%
  summarise(n_all = n(),
            detection_rate_all = mean(!value_below_loq) * 100) 

-detection_matrix_promisces <- data_all %>%
+#detection_matrix_promisces <- 
+data_all %>%
  filter(datasource == "EU-PROMISCES") %>%
  group_by(matrix) %>%
  summarise(n_promisces = n(),
            detection_rate_promisces = mean(!value_below_loq) * 100)

-detection_matrix_combine <- detection_matrix_wholeset %>%
-  left_join(detection_matrix_promisces, by = "matrix") 
+# detection_matrix_combine <- detection_matrix_wholeset %>%
+#   left_join(detection_matrix_promisces, by = "matrix") 

 # write.csv(detection_matrix_combine, "docs/detection_pfas_matrix.csv")

 # --------- detection among type ------
-detection_type_wholeset <- data_all %>%
+#detection_type_wholeset <- 
+data_all %>%
  group_by(type) %>%
  summarise(n_all = n(),
            detection_rate_all = mean(!value_below_loq) * 100) 

-detection_type_promisces <- data_all %>%
+#detection_type_promisces <- 
+data_all %>%
  filter(datasource == "EU-PROMISCES") %>%
  group_by(type) %>%
  summarise(n_promisces = n(),
            detection_rate_promisces = mean(!value_below_loq) * 100)

-detection_type_combine <- detection_type_wholeset %>%
-  left_join(detection_type_promisces, by = "type") 
+# detection_type_combine <- detection_type_wholeset %>%
+#   left_join(detection_type_promisces, by = "type") 

 # write.csv(detection_type_combine, "docs/detection_pfas_type.csv")
\ No newline at end of file
--- a/src/3_MFA.R
+++ b/src/3_MFA.R
-## Factor Analysis on Mixed Data (FAMD)
-library(ProjectTemplate)
-load.project()
-
-library(FactoMineR)
-library(Factoshiny)
-
-# -------- restructure the data for FAMD --------
-data_famd_raw <- data_ros %>%
-  filter(datasource == "EU-PROMISCES") %>%
-  select(code, matrix, type, substance, value) %>%
-  group_by(code, matrix, type, substance) %>%
-  filter(row_number() == 1) %>%
-  ungroup() %>%
-  pivot_wider(names_from = substance, values_from = value)
-
-colSums(is.na(data_famd_raw))
-# data_famd_raw %>% filter(if_any(4:35, is.na))
-
-# fill NAs with half_loq
-data_loq <- data_ros %>%
-    filter(datasource == "EU-PROMISCES") %>%
-    filter(value_below_loq == TRUE) %>%
-    group_by(substance, type) %>%
-    summarise(loq = mean(loq), .groups = "drop") 
-
-data_famd_long <- data_famd_raw %>%
-  pivot_longer(cols = 4:35, names_to = "substance", values_to = "value") %>%
-  left_join(data_loq, by = c("substance", "type")) %>%
-  mutate(value = ifelse(is.na(value), loq / 2, value)) %>%
-  select(-loq)
-
-data_famd <- data_famd_long %>%
-  pivot_wider(names_from = substance, values_from = value) %>%
-  select(-code)
-
-Factoshiny(data_famd)
-
-# -------- perform MFA ------
-res.FAMD<-FAMD(data_famd,graph=FALSE)
-plot.FAMD(res.FAMD,invisible=c('ind.sup'),title="Graph of individuals and categories")
-plot.FAMD(res.FAMD,axes=c(1,2),choix='var',title="Graph of the variables")
-plot.FAMD(res.FAMD, choix='quanti',title="Correlation circle")
-
-
-
-summary(res.FAMD)
-