Phenotype for LMM

#columns to extract

library(xlsx)
xl <- read.xlsx(file = "/Users/elise.delzant/Documents/01GM_Association/Phenotypes/ukb_pheno.xlsx",
    sheetIndex = 1)
df_xl <- as.data.frame(xl)
extract_feat <- df_xl$UKB_extract_field
chr_feat <- c(1)
for (elem in extract_feat) {
    chr_feat <- paste(chr_feat, elem, sep = ",")
}
chr_feat

#extract .tab from ukb with our ID of interest

#extract ethnicity

wd="/network/iss/ukbiobank/unorganized/unorganized_pheno"
od="/network/iss/ukbiobank/clinic/pheno/organized_pheno"
bind="/network/iss/ukbiobank/software" 
cd $od

${bind}/qsubshcom "cut -f1,1102,11103,11104,11105 ${wd}/ukb673035.tab > ${od}/ukb673035_UKbiobank_ED_Ethnicity.tab|;
" 1 10G UKB_pheno_extract 24:00:00 " "

#keep only subjects that have a T1 and separate into T1 20252 and T1 20263, and participants who want to be rm

#birth date and year

# Field 31
library(dplyr)
library(lubridate)
pheno$Sexe <- as.factor(pheno$Sexe)
levels(pheno$Sexe) <- c("Female", "Male")

# Field 11093
pheno$Assessment_center_age <- as.numeric(pheno$Assessment_center_age)

# Add birth_date
pheno$Birth_year <- as.numeric(pheno$Birth_year)
pheno$Birth_month <- as.numeric(pheno$Birth_month)
pheno$Birth_date <- make_date(year = pheno$Birth_year, month = pheno$Birth_month,
    day = 1)

#Create Total Brain Volume feature #Fields 26556 26587 26553 26584

missings <- pheno[is.na(pheno$GMV) | is.na(pheno$Cerebral_WMV_l) |
    is.na(pheno$Cerebral_WMV_r) | is.na(pheno$Cerebell_WMV_l) |
    is.na(pheno$Cerebell_WMV_r), ]
pheno <- pheno[!(pheno$ID %in% missings$ID), ]
pheno$TBV <- as.numeric(pheno$GMV) + as.numeric(pheno$Cerebral_WMV_l) +
    as.numeric(pheno$Cerebral_WMV_r) + as.numeric(pheno$Cerebell_WMV_l) +
    as.numeric(pheno$Cerebell_WMV_r)

#Depression and bipolar status with field 20126

pheno$Dep_status <- as.factor(pheno$Dep_status)
levels(pheno$Dep_status) <- c("No Bipolar or depression", "Bipolar I",
    "Bipolar II", "Major depression (severe)", "Major depression (moderate)",
    "Single major depression episod")
pheno$Depression <- as.factor(ifelse(pheno$Dep_status %in% c("No Bipolar or depression",
    "Bipolar I", "Bipolar II"), "No depression", pheno$Dep_status))
levels(pheno$Depression) <- c("Major depression (severe)", "Major depression (moderate)",
    "Single major depression episod", "No depression")

#Excluding former drinkers from frequency drinking alcohol

pheno[which(pheno$Alcohol_freq == "-818"), "Alcohol_freq"] <- NA

pheno[which(is.na(pheno$Alcohol_ever_dependent)), "Alcohol_ever_dependent"] <- 2  #new value that indicates NA stands for not ever having been addicted 
pheno[which(pheno$Alcohol_ever_dependent == "-818"), "Alcohol_freq"] <- NA
pheno[which(pheno$Alcohol_ever_dependent == "-121"), "Alcohol_freq"] <- NA

pheno[pheno$Alcohol_ever_dependent == 1 & !is.na(pheno$Alcohol_ever_dependent),
    "Alcohol_freq"] <- NA  #in Alcohol freq we want only people that have not been formerly addicted 
pheno$Alcohol_freq <- as.factor(pheno$Alcohol_freq)
levels(pheno$Alcohol_freq) <- c("Never", "Monthly or less", "2-4 t./month",
    "2-3t./week", "4+t./week")

#stroke, alzheimer and parkinson

pheno$Stroke <- as.factor(ifelse(is.na(pheno$Stroke_date), "No",
    "Yes"))
pheno$Stroke_age <- as.numeric(as.Date(pheno$Stroke_date) - pheno$Birth_date)/365.25
# 5 subjects with Stroke date 01/01/1900
pheno$Alzheimer <- as.factor(ifelse(is.na(pheno$Alzheimer_date),
    "No", "Yes"))
pheno$Alzheimer_age <- as.numeric(as.Date(pheno$Alzheimer_date) -
    pheno$Birth_date)/365.25
pheno$Parkinson <- as.factor(ifelse(is.na(pheno$Parkinson_date),
    "No", "Yes"))
pheno$Parkinson_age <- as.numeric(as.Date(pheno$Parkinson_date) -
    pheno$Birth_date)/365.25


# get infos on when alzheimer, how many years after imaging
alz <- pheno[pheno$Alzheimer == "Yes", ]
alz <- alz[, c("Alzheimer_date", "Date_center_V2")]
alz$Diff <- as.Date(alz$Alzheimer_date) - as.Date(alz$Date_center_V2)
summary(as.numeric(alz$Diff))

# get infos on when Parkinson, how many years after imaging
alz <- pheno[pheno$Parkinson == "Yes", ]
alz <- alz[, c("Parkinson_date", "Date_center_V2")]
alz$Diff <- as.Date(alz$Parkinson_date) - as.Date(alz$Date_center_V2)
summary(as.numeric(alz$Diff))

#merging visit 0, 1 and 2

# Field 21001
pheno$BMI <- as.numeric(ifelse(is.na(pheno$BMI_V2), ifelse(is.na(pheno$BMI_V1),
    pheno$BMI_V0, pheno$BMI_V1), pheno$BMI_V2))

# Field 845 value -2 equals never went to school
pheno[which(pheno$Education_age_V2 %in% c("-1", "-3")), "Education_age_V2"] <- NA
pheno[which(pheno$Education_age_V1 %in% c("-1", "-3")), "Education_age_V1"] <- NA
pheno[which(pheno$Education_age_V0 %in% c("-1", "-3")), "Education_age_V0"] <- NA
pheno$Education_age <- as.numeric(ifelse(is.na(pheno$Education_age_V2),
    ifelse(is.na(pheno$Education_age_V1), pheno$Education_age_V0,
        pheno$Education_age_V1), pheno$Education_age_V2))
pheno[which(pheno$Education_age == -2), "Education_age"] <- 0
# Field 1200
pheno[which(pheno$Sleeplessness_V2 == "-3"), "Sleeplessness_V2"] <- NA
pheno[which(pheno$Sleeplessness_V1 == "-3"), "Sleeplessness_V1"] <- NA
pheno[which(pheno$Sleeplessness_V0 == "-3"), "Sleeplessness_V0"] <- NA
pheno$Sleeplessness <- as.factor(ifelse(is.na(pheno$Sleeplessness_V2),
    ifelse(is.na(pheno$Sleeplessness_V1), pheno$Sleeplessness_V0,
        pheno$Sleeplessness_V1), pheno$Sleeplessness_V2))

levels(pheno$Sleeplessness) <- c("Never/rarely", "Sometimes",
    "Usually")

# 1 Never/rarely 2 Sometimes 3 Usually

# set 'prefer not to answer to NA
for (i in 0:2) {
    var_name <- paste0("Tobacco_current_V", i)
    pheno[which(pheno[, var_name] == "-3"), var_name] <- NA

    var_name <- paste0("Tobacco_past_V", i)
    pheno[which(pheno[, var_name] == "-3"), var_name] <- NA
}

# Field 1239 and 1249 new var smoking
pheno$smoking <- NA
pheno[which(pheno$Tobacco_past_V0 == "2" | pheno$Tobacco_past_V0 ==
    "4" | pheno$Tobacco_past_V0 == "3" & pheno$Tobacco_past_V1 ==
    "2" | pheno$Tobacco_past_V1 == "4" | pheno$Tobacco_past_V1 ==
    "3" & pheno$Tobacco_past_V2 == "2" | pheno$Tobacco_past_V2 ==
    "4" | pheno$Tobacco_past_V2 == "3" & pheno$Tobacco_current_V0 ==
    "1" | pheno$Tobacco_current_V0 == "0" & pheno$Tobacco_current_V1 ==
    "1" | pheno$Tobacco_current_V1 == "0" & pheno$Tobacco_current_V2 ==
    "1" | pheno$Tobacco_current_V2 == "0"), "smoking"] <- "Occasionnally"

pheno[which(pheno$Tobacco_current_V0 == "0" & pheno$Tobacco_current_V1 ==
    "0" & pheno$Tobacco_current_V2 == "0" & pheno$Tobacco_past_V0 ==
    "4" & pheno$Tobacco_past_V1 == "4" & pheno$Tobacco_past_V2 ==
    "4"), "smoking"] <- "Never_smoked"
pheno[which(pheno$Tobacco_past_V0 == "3" | pheno$Tobacco_past_V1 ==
    "3" | pheno$Tobacco_past_V2 == "3" & pheno$Tobacco_current_V0 ==
    "0" & pheno$Tobacco_current_V1 == "0" & pheno$Tobacco_current_V1 ==
    "0"), "smoking"] <- "Tried"
pheno[which(pheno$Tobacco_past_V0 == "1" | pheno$Tobacco_past_V1 ==
    "1" | pheno$Tobacco_past_V2 == "1" | pheno$Tobacco_current_V0 ==
    "2" | pheno$Tobacco_current_V1 == "2" | pheno$Tobacco_current_V2 ==
    "2"), "smoking"] <- "Ever_regularly"
pheno[which(is.na(pheno$smoking) & pheno$Tobacco_current_V2 ==
    "1"), "smoking"] <- "Occasionnally"
pheno[which(is.na(pheno$smoking) & pheno$Tobacco_past_V2 == "3"),
    "smoking"] <- "Tried"

# get an average of smoked cigarettes accross three time
# points, -10 means less than one cigarette
pheno$Cig_daily_V1 <- as.numeric(pheno$Cig_daily_V1)
pheno[which(pheno$Cig_daily_V1 == -1), "Cig_daily_V1"] <- NA
pheno[which(pheno$Cig_daily_V1 == -10), "Cig_daily_V1"] <- 1

pheno$Cig_daily_V2 <- as.numeric(pheno$Cig_daily_V2)
pheno[which(pheno$Cig_daily_V2 == -1), "Cig_daily_V2"] <- NA
pheno[which(pheno$Cig_daily_V2 == -10), "Cig_daily_V2"] <- 1

pheno$Cig_daily_V3 <- as.numeric(pheno$Cig_daily_V3)
pheno[which(pheno$Cig_daily_V3 == -1), "Cig_daily_V3"] <- NA
pheno[which(pheno$Cig_daily_V3 == -10), "Cig_daily_V3"] <- 1


pheno$cigarettes_daily <- rowMeans(pheno[, c("Cig_daily_V1",
    "Cig_daily_V2", "Cig_daily_V3")])


# check that non smokers and 'tried' have not answer this
# question

if (nrow(pheno[which(pheno$smoking == "Tried" & !is.na(pheno$cigarettes_daily)),
    ]) != 0) {
    print("We have someone who's only tried a cigarette once or twice who indicates amounts of cigarettes smoked")
    break
}

if (nrow(pheno[which(pheno$smoking == "Never_smoked" & !is.na(pheno$cigarettes_daily)),
    ]) != 0) {
    print("We have someone who's never smoked who indicates amounts of cigarettes smoked")
    break
}


# check that the ones who didn't answer the daily
# cigarettes question are only occasional smokers
nrow(pheno[which(pheno$smoking == "Occasionally" & !is.na(pheno$cigarettes_daily)),
    ])  # none either 

# split regular smokers into quantiles based on their daily
# cigarettes indication
pheno$smoking[which(pheno$cigarettes_daily <= quantile(pheno$cigarettes_daily,
    0.25, na.rm = T))] <- "Regular_bottomquantile"
pheno$smoking[which(pheno$cigarettes_daily > quantile(pheno$cigarettes_daily,
    0.25, na.rm = T) & pheno$cigarettes_daily <= quantile(pheno$cigarettes_daily,
    0.5, na.rm = T))] <- "Regular_2ndquantile"
pheno$smoking[which(pheno$cigarettes_daily > quantile(pheno$cigarettes_daily,
    0.5, na.rm = T) & pheno$cigarettes_daily <= quantile(pheno$cigarettes_daily,
    0.75, na.rm = T))] <- "Regular_3rdquantile"
pheno$smoking[which(pheno$cigarettes_daily > quantile(pheno$cigarettes_daily,
    0.75, na.rm = T) & pheno$cigarettes_daily <= quantile(pheno$cigarettes_daily,
    1, na.rm = T))] <- "Regular_4thquantile"

# If I define it this way I will have to get rid of the
# regular smokers who didn't indicate how many cigarettes
# they use :(
pheno$smoking <- ifelse(pheno$smoking == "Ever_regularly" & is.na(pheno$cigarettes_daily),
    NA, pheno$smoking)

pheno$smoking <- factor(pheno$smoking, levels = c("Never_smoked",
    "Tried", "Occasionnally", "Regular_bottomquantile", "Regular_2ndquantile",
    "Regular_3rdquantile", "Regular_4thquantile"))
# Field 1787
pheno[which(pheno$Maternal_smoking_V2 %in% c("-1", "-3")), "Maternal_smoking_V2"] <- NA
pheno[which(pheno$Maternal_smoking_V1 %in% c("-1", "-3")), "Maternal_smoking_V1"] <- NA
pheno[which(pheno$Maternal_smoking_V0 %in% c("-1", "-3")), "Maternal_smoking_V0"] <- NA

pheno$Maternal_smoking <- as.factor(ifelse(is.na(pheno$Maternal_smoking_V2),
    ifelse(is.na(pheno$Maternal_smoking_V1), pheno$Maternal_smoking_V0,
        pheno$Maternal_smoking_V1), pheno$Maternal_smoking_V2))
levels(pheno$Maternal_smoking) <- c("No", "Yes")
# Field 2976
pheno[which(pheno$Diabetes_V2 %in% c("-1", "-3")), "Diabetes_V2"] <- NA
pheno[which(pheno$Diabetes_V1 %in% c("-1", "-3")), "Diabetes_V1"] <- NA
pheno[which(pheno$Diabetes_V0 %in% c("-1", "-3")), "Diabetes_V0"] <- NA

pheno$Diabetes <- as.factor(ifelse(is.na(pheno$Diabetes_V2),
    ifelse(is.na(pheno$Diabetes_V1), pheno$Diabetes_V0, pheno$Diabetes_V1),
    pheno$Diabetes_V2))
levels(pheno$Diabetes) <- c("No", "Yes")
# Field 2966
pheno[which(pheno$High_blood_pressure_age_V2 %in% c("-1", "-3")),
    "High_blood_pressure_age_V2"] <- NA
pheno[which(pheno$High_blood_pressure_age_V1 %in% c("-1", "-3")),
    "High_blood_pressure_age_V1"] <- NA
pheno[which(pheno$High_blood_pressure_age_V0 %in% c("-1", "-3")),
    "High_blood_pressure_age_V0"] <- NA

pheno$High_blood_pressure_age <- as.numeric(ifelse(is.na(pheno$High_blood_pressure_age_V2),
    ifelse(is.na(pheno$High_blood_pressure_age_V1), pheno$High_blood_pressure_age_V0,
        pheno$High_blood_pressure_age_V1), pheno$High_blood_pressure_age_V2))

# new variable high blood pressure as category Yes No
pheno$High_blood_pressure <- as.factor(ifelse(is.na(pheno$High_blood_pressure_age),
    "No", "Yes"))

# Field 2976
pheno$Diabetes_age <- as.numeric(ifelse(is.na(pheno$Diabetes_age_V2),
    ifelse(is.na(pheno$Diabetes_age_V1), pheno$Diabetes_age_V0,
        pheno$Diabetes_age_V1), pheno$Diabetes_age_V2))

pheno[which(pheno$Diabetes_age %in% c(-1, -3)), "Diabetes_age"] <- NA

# Field 4803
pheno[which(pheno$Tinnitus_V2 == "-1" | pheno$Tinnitus_V2 ==
    "-3"), "Tinnitus_V2"] <- NA
pheno[which(pheno$Tinnitus_V1 == "-1" | pheno$Tinnitus_V1 ==
    "-3"), "Tinnitus_V1"] <- NA
pheno[which(pheno$Tinnitus_V0 == "-1" | pheno$Tinnitus_V0 ==
    "-3"), "Tinnitus_V0"] <- NA

pheno$Tinnitus <- as.factor(ifelse(is.na(pheno$Tinnitus_V2),
    ifelse(is.na(pheno$Tinnitus_V1), pheno$Tinnitus_V0, pheno$Tinnitus_V1),
    pheno$Tinnitus_V2))
levels(pheno$Tinnitus) <- c(0, 1, 2, 3, 4, NA)
levels(pheno$Tinnitus) <- c("Never", "Most or all the time",
    "A lot of the time", "Some of the time", "Not now but in the past")

# field 2754 age at first birth
pheno[which(pheno$First_birth_V1 %in% c("-3", "-4")), "First_birth_V1"] <- NA
pheno[which(pheno$First_birth_V2 %in% c("-3", "-4")), "First_birth_V2"] <- NA
pheno[which(pheno$First_birth_V3 %in% c("-3", "-4")), "First_birth_V3"] <- NA

pheno$First_birth_age <- as.numeric(ifelse(is.na(pheno$First_birth_V3),
    ifelse(is.na(pheno$First_birth_V2), pheno$First_birth_V3,
        pheno$First_birth_V2), pheno$First_birth_V3))

# Hip circumference
pheno$Hip_circumf <- as.numeric(ifelse(is.na(pheno$Hip_circumf_V2),
    ifelse(is.na(pheno$Hip_circumf_V1), pheno$Hip_circumf_V0,
        pheno$Hip_circumf_V1), pheno$Hip_circumf_V2))


# Waist circumference
pheno$Waist_circumf <- as.numeric(ifelse(is.na(pheno$Waist_circumf_V2),
    ifelse(is.na(pheno$Waist_circumf_V1), pheno$Waist_circumf_V0,
        pheno$Waist_circumf_V1), pheno$Waist_circumf_V2))

# Nb children (male only)
pheno[which(pheno$Nb_children_V2 %in% c("-1", "-3")), "Nb_children_V2"] <- NA
pheno[which(pheno$Nb_children_V1 %in% c("-1", "-3")), "Nb_children_V1"] <- NA
pheno[which(pheno$Nb_children_V0 %in% c("-1", "-3")), "Nb_children_V0"] <- NA
pheno$Nb_children <- ifelse(is.na(pheno$Nb_children_V2), ifelse(is.na(pheno$Nb_children_V1),
    pheno$Nb_children_V0, pheno$Nb_children_V1), pheno$Nb_children_V2)

# Nb live birth (female only)
pheno[which(pheno$Nb_live_birth_V2 %in% c("-3")), "Nb_live_birth_V2"] <- NA
pheno[which(pheno$Nb_live_birth_V1 %in% c("-3")), "Nb_live_birth_V1"] <- NA
pheno[which(pheno$Nb_live_birth_V0 %in% c("-3")), "Nb_live_birth_V0"] <- NA
pheno$Nb_live_birth <- ifelse(is.na(pheno$Nb_live_birth_V2),
    ifelse(is.na(pheno$Nb_live_birth_V1), pheno$Nb_live_birth_V0,
        pheno$Nb_live_birth_V1), pheno$Nb_live_birth_V2)

# merging both
pheno$Nb_children_both <- as.factor(ifelse(is.na(pheno$Nb_children),
    pheno$Nb_live_birth, pheno$Nb_children))

# Age when first sexual intercourse
pheno[which(pheno$Sex_first_age_V2 %in% c("-1", "-3")), "Sex_first_age_V2"] <- NA
pheno[which(pheno$Sex_first_age_V1 %in% c("-1", "-3")), "Sex_first_age_V1"] <- NA
pheno[which(pheno$Sex_first_age_V0 %in% c("-1", "-3")), "Sex_first_age_V0"] <- NA

pheno$Sex_first_age <- as.numeric(ifelse(is.na(pheno$Sex_first_age_V2),
    ifelse(is.na(pheno$Sex_first_age_V1), pheno$Sex_first_age_V0,
        pheno$Sex_first_age_V1), pheno$Sex_first_age_V2))

# Scaling T1
pheno$Scaling_T1 <- as.factor(pheno$Scaling_T1)

# SNR
pheno$SNR <- as.numeric(pheno$SNR)

# Brain volume
pheno$Brain_volume <- as.numeric(pheno$Brain_volume)

# Discrepancy between T1 and standard template linearly
# aligned
pheno$Discrepancy_template <- as.numeric(pheno$Discrepancy_template)

#cognitiv score

# cognitiv score Anna Furtjes Verbal numeric reasoning
# (Fluid intelligence scores)
VNR_vars <- c("Fluid_intell_score_1_V0", "Fluid_intell_score_1_V1",
    "Fluid_intell_score_1_V2", "Fluid_intell_score_2")

# Matrix pattern completion
Matrix_vars <- c("Mat_pat_comp_V2", "Mat_pat_comp_V3")

# Symbol digit
SymbolDigit_vars <- c("Number_digit_2_V2", "Number_digit_2_V3",
    "Number_digit_1")

# PairMatching
PairMatching_vars <- c("Incorrect_match_V01", "Incorrect_match_V02",
    "Incorrect_match_V03", "Incorrect_match_V11", "Incorrect_match_V12",
    "Incorrect_match_V13", "Incorrect_match_V21", "Incorrect_match_V22",
    "Incorrect_match_V23")

# Reaction time
RT_vars <- c("Reaction_time_V0", "Reaction_time_V1", "Reaction_time_V2",
    "Reaction_time_V3")

Tower_vars <- c("Tower_rearr_V2", "Tower_rearr_V3")

# Trail Making
TrailMaking_vars <- c("Duration_alphanumeric_1", "Duration_alphanumeric_2")

# item 6350: entry of zero means not completed
pheno$Duration_alphanumeric_1 <- as.numeric(pheno$Duration_alphanumeric_1)
pheno$Duration_alphanumeric_1 <- ifelse(pheno$Duration_alphanumeric_1 <=
    0, NA, pheno$Duration_alphanumeric_1)

pheno$Duration_alphanumeric_2 <- as.numeric(pheno$Duration_alphanumeric_2)
pheno$Reaction_time_V0 <- as.numeric(pheno$Reaction_time_V0)
pheno$Reaction_time_V1 <- as.numeric(pheno$Reaction_time_V1)
pheno$Reaction_time_V2 <- as.numeric(pheno$Reaction_time_V2)
pheno$Reaction_time_V3 <- as.numeric(pheno$Reaction_time_V3)
pheno$Incorrect_match_V01 <- as.numeric(pheno$Incorrect_match_V01)
pheno$Incorrect_match_V02 <- as.numeric(pheno$Incorrect_match_V02)
pheno$Incorrect_match_V03 <- as.numeric(pheno$Incorrect_match_V03)
pheno$Incorrect_match_V11 <- as.numeric(pheno$Incorrect_match_V11)
pheno$Incorrect_match_V12 <- as.numeric(pheno$Incorrect_match_V12)
pheno$Incorrect_match_V13 <- as.numeric(pheno$Incorrect_match_V13)
pheno$Incorrect_match_V21 <- as.numeric(pheno$Incorrect_match_V21)
pheno$Incorrect_match_V22 <- as.numeric(pheno$Incorrect_match_V22)
pheno$Incorrect_match_V23 <- as.numeric(pheno$Incorrect_match_V23)
pheno$Fluid_intell_score_1_V0 <- as.numeric(pheno$Fluid_intell_score_1_V0)
pheno$Fluid_intell_score_1_V1 <- as.numeric(pheno$Fluid_intell_score_1_V1)
pheno$Fluid_intell_score_1_V2 <- as.numeric(pheno$Fluid_intell_score_1_V2)
pheno$Fluid_intell_score_2 <- as.numeric(pheno$Fluid_intell_score_2)
pheno$Mat_pat_comp_V2 <- as.numeric(pheno$Mat_pat_comp_V2)
pheno$Mat_pat_comp_V3 <- as.numeric(pheno$Mat_pat_comp_V3)
pheno$Number_digit_1 <- as.numeric(pheno$Number_digit_1)
pheno$Number_digit_2_V2 <- as.numeric(pheno$Number_digit_2_V2)
pheno$Number_digit_2_V3 <- as.numeric(pheno$Number_digit_2_V3)
pheno$Tower_rearr_V2 <- as.numeric(pheno$Tower_rearr_V2)
pheno$Tower_rearr_V3 <- as.numeric(pheno$Tower_rearr_V3)
all_vars <- c(VNR_vars, Matrix_vars, SymbolDigit_vars, PairMatching_vars,
    RT_vars, Tower_vars, TrailMaking_vars)
pheno$Tinnitus_yes <- ifelse(is.na(pheno$Tinnitus) | pheno$Tinnitus ==
    "Never", 0, 1)
# apply reverse function because in some variables a lower
# value means better cognitiv ability

reverse = function(a) {
    a * (-1)
}


pheno[, c(PairMatching_vars, TrailMaking_vars, RT_vars)] <- apply(pheno[,
    c(PairMatching_vars, TrailMaking_vars, RT_vars)], 2, FUN = reverse)

# Standardise variables
normFunc <- function(x) {
    (x - mean(x, na.rm = T))/sd(x, na.rm = T)
}

pheno[, all_vars] <- apply(pheno[, all_vars], 2, normFunc)
summary(pheno[, all_vars])

############################## 1 Verbal-numerical reasoning

pheno$mean_VNR <- rowMeans(pheno[, VNR_vars], na.rm = T)

summary(pheno$mean_VNR)

############################## 2 Matrix

pheno$mean_Matrix <- rowMeans(pheno[, Matrix_vars], na.rm = T)
summary(pheno$mean_Matrix)

############################## 3 Symbol Digit

pheno$mean_SymbolDigit <- rowMeans(pheno[, SymbolDigit_vars],
    na.rm = T)
summary(pheno$mean_SymbolDigit)

############################## 4 Pairs Matching
pheno$mean_PairMatching <- rowMeans(pheno[, PairMatching_vars],
    na.rm = T)
summary(pheno$mean_PairMatching)

############################## 5 Reaction Time RT_vars

pheno$mean_RT <- rowMeans(pheno[, RT_vars], na.rm = T)
summary(pheno$mean_RT)


############################## 6 Tower Tower_vars

pheno$mean_Tower <- rowMeans(pheno[, Tower_vars], na.rm = T)

summary(pheno$mean_Tower)

############################## 7 TrailMaking
############################## TrailMaking_vars 0
############################## represents 'Trail not
############################## completed


pheno$mean_TrailMaking <- rowMeans(pheno[, TrailMaking_vars],
    na.rm = T)
summary(pheno$mean_TrailMaking)

################################################## Create
################################################## Factor
################################################## Scores
library(lavaan)
gpheno <- pheno[, c("ID", "mean_VNR", "mean_Matrix", "mean_SymbolDigit",
    "mean_PairMatching", "mean_RT", "mean_Tower", "mean_TrailMaking")]

#### remove all participants who have missing values in all
#### cog vars
if (sum(rowSums(is.na(gpheno)) == 7) == 0) {
    print("Each participant has at least one cognitive measurement - woop")
}

# remove outliers for each cognitive variable
colSums(!is.na(gpheno))

for (i in c("mean_VNR", "mean_Matrix", "mean_SymbolDigit", "mean_PairMatching",
    "mean_RT", "mean_Tower", "mean_TrailMaking")) {
    # calculate volume specific mean
    mean_var <- mean(gpheno[, i], na.rm = T)
    # calculate volume specific sd
    sd_var <- sd(gpheno[, i], na.rm = T)
    # remove values that are beyond 4 SDs
    gpheno[, i][which(gpheno[, i] < mean_var - (sd_var * 4) |
        gpheno[, i] > mean_var + (sd_var * 4))] <- NA
}
colSums(!is.na(gpheno))

# build SEM model
gmodel <- "Cog =~ mean_VNR + mean_Matrix + mean_SymbolDigit + mean_PairMatching + mean_RT + mean_Tower + mean_TrailMaking"

# fit SEM model
fit <- cfa(gmodel, data = gpheno, missing = "ML")
save(fit, file = "g_score_model_fit.RData")
print(summary(fit))

# predict individual-level factor scores
test_gpheno <- as.data.frame(lavPredict(fit, newdata = gpheno,
    type = "lv", append.data = T))

# merge with gpheno to get correct eid's

gpheno <- merge(test_gpheno, gpheno, by = c("mean_VNR", "mean_Matrix",
    "mean_SymbolDigit", "mean_PairMatching", "mean_RT", "mean_Tower",
    "mean_TrailMaking"))

gpheno <- gpheno[, c("ID", "Cog")]

# only keep non-missing participants
gpheno <- gpheno[!is.na(gpheno$Cog), ]

### realised later OSCA wants IID FID value/ no col.names
gpheno$FID <- gpheno$ID

gpheno <- gpheno[, c("ID", "FID", "Cog")]

##### remove outliers outside of 4SDs
mean <- mean(gpheno$Cog, na.rm = T)
sd <- sd(gpheno$Cog, na.rm = T)
gpheno$Cog[which(gpheno$Cog < mean - (sd * 4) | gpheno$Cog >
    mean + (sd * 4))] <- NA

print(summary(gpheno))

pheno$Cog <- ifelse(pheno$ID %in% gpheno$ID, gpheno$Cog, NA)

library(lavaan)
library(semPlot)


## fit_reserve gives the same results as fit but with fully
## positive factor loadings (by reverse I mean that I have
## multiplied items with negative polarity by -1 so that
## all loadings are positive) we now want to get the
## variance explained by the model: we take the sum of the
## variance output for each item and divide by the number
## of items
total_variance_explained <- mean(1 - mean(rowSums(inspect(fit,
    what = "std")$theta)))
print(paste0("The total variance explained by this model is ",
    round(total_variance_explained, digits = 2)))

labels <- c("VNR", "Matrix", "Symbol\nDigit", "Pair\nMatching",
    "RT", "Tower", "Trail\nMaking", "Cog")

semPaths(fit, what = "paths", whatLabels = "est", intercepts = FALSE,
    style = "OpenMx", layout = "tree", nodeLabels = labels, curvePivot = TRUE,
    edge.color = "black", sizeMan = 8, pastel = TRUE, rainbowStart = 0.5,
    nDigits = 2, title = TRUE)

#pain

# reponses du choix multiple: 1 Headache 2 Facial pain 3
# Neck or shoulder 4 Back pain 5 Stomach pain 6 Hip pain 7
# Knee pain 8 Pain all over the body


pheno[which(pheno$Pain_V00 %in% c("-7", "-3", "8")), "Pain_V00"] <- NA
pheno[which(pheno$Pain_V01 %in% c("-7", "-3", "8")), "Pain_V01"] <- NA
pheno[which(pheno$Pain_V02 %in% c("-7", "-3", "8")), "Pain_V02"] <- NA
pheno[which(pheno$Pain_V03 %in% c("-7", "-3", "8")), "Pain_V03"] <- NA
pheno[which(pheno$Pain_V04 %in% c("-7", "-3", "8")), "Pain_V04"] <- NA
pheno[which(pheno$Pain_V05 %in% c("-7", "-3", "8")), "Pain_V05"] <- NA
pheno[which(pheno$Pain_V06 %in% c("-7", "-3", "8")), "Pain_V06"] <- NA
pheno[which(pheno$Pain_V10 %in% c("-7", "-3", "8")), "Pain_V10"] <- NA
pheno[which(pheno$Pain_V11 %in% c("-7", "-3", "8")), "Pain_V11"] <- NA
pheno[which(pheno$Pain_V12 %in% c("-7", "-3", "8")), "Pain_V12"] <- NA
pheno[which(pheno$Pain_V13 %in% c("-7", "-3", "8")), "Pain_V13"] <- NA
pheno[which(pheno$Pain_V14 %in% c("-7", "-3", "8")), "Pain_V14"] <- NA
pheno[which(pheno$Pain_V15 %in% c("-7", "-3", "8")), "Pain_V15"] <- NA
pheno[which(pheno$Pain_V16 %in% c("-7", "-3", "8")), "Pain_V16"] <- NA
pheno[which(pheno$Pain_V20 %in% c("-7", "-3", "8")), "Pain_V20"] <- NA
pheno[which(pheno$Pain_V21 %in% c("-7", "-3", "8")), "Pain_V21"] <- NA
pheno[which(pheno$Pain_V22 %in% c("-7", "-3", "8")), "Pain_V22"] <- NA
pheno[which(pheno$Pain_V23 %in% c("-7", "-3", "8")), "Pain_V23"] <- NA
pheno[which(pheno$Pain_V24 %in% c("-7", "-3", "8")), "Pain_V24"] <- NA
pheno[which(pheno$Pain_V25 %in% c("-7", "-3", "8")), "Pain_V25"] <- NA
pheno[which(pheno$Pain_V26 %in% c("-7", "-3", "8")), "Pain_V26"] <- NA

pheno$all_pains_V0 <- 0
pheno$all_pains_V1 <- 0
pheno$all_pains_V2 <- 0

pain_V0 <- function(x) {
    all_pains <- c(as.character(x[, c("Pain_V00", "Pain_V01",
        "Pain_V02", "Pain_V03", "Pain_V04", "Pain_V05", "Pain_V06")]))
    all_pains <- all_pains[!is.na(all_pains)]
    if ("1" %in% all_pains & (x[, c("Headache_V0")] != "1"))
        all_pains <- all_pains[all_pains != "1"]
    if ("2" %in% all_pains & (x[, c("Facial_pain_V0")] != "1"))
        all_pains <- all_pains[all_pains != "2"]
    if ("3" %in% all_pains & (x[, c("Neck_pain_V0")] != "1"))
        all_pains <- all_pains[all_pains != "3"]
    if ("4" %in% all_pains & (x[, c("Back_pain_V0")] != "1"))
        all_pains <- all_pains[all_pains != "4"]
    if ("5" %in% all_pains & (x[, c("Stomach_pain_V0")] != "1"))
        all_pains <- all_pains[all_pains != "5"]
    if ("6" %in% all_pains & (x[, c("Hip_pain_V0")] != "1"))
        all_pains <- all_pains[all_pains != "6"]
    if ("7" %in% all_pains & (x[, c("Knee_pain_V0")] != "1"))
        all_pains <- all_pains[all_pains != "7"]
    return(length(all_pains))

}

pain_V1 <- function(x) {
    all_pains <- c(as.character(x[, c("Pain_V10", "Pain_V11",
        "Pain_V12", "Pain_V13", "Pain_V14", "Pain_V15", "Pain_V16")]))
    all_pains <- all_pains[!is.na(all_pains)]
    if ("1" %in% all_pains & (x[, c("Headache_V1")] != "1"))
        all_pains <- all_pains[all_pains != "1"]
    if ("2" %in% all_pains & (x[, c("Facial_pain_V1")] != "1"))
        all_pains <- all_pains[all_pains != "2"]
    if ("3" %in% all_pains & (x[, c("Neck_pain_V1")] != "1"))
        all_pains <- all_pains[all_pains != "3"]
    if ("4" %in% all_pains & (x[, c("Back_pain_V1")] != "1"))
        all_pains <- all_pains[all_pains != "4"]
    if ("5" %in% all_pains & (x[, c("Stomach_pain_V1")] != "1"))
        all_pains <- all_pains[all_pains != "5"]
    if ("6" %in% all_pains & (x[, c("Hip_pain_V1")] != "1"))
        all_pains <- all_pains[all_pains != "6"]
    if ("7" %in% all_pains & (x[, c("Knee_pain_V1")] != "1"))
        all_pains <- all_pains[all_pains != "7"]
    return(length(all_pains))

}

pain_V2 <- function(x) {
    all_pains <- c(as.character(x[, c("Pain_V20", "Pain_V21",
        "Pain_V22", "Pain_V23", "Pain_V24", "Pain_V25", "Pain_V26")]))
    all_pains <- all_pains[!is.na(all_pains)]
    if ("1" %in% all_pains & (x[, c("Headache_V2")] != "1"))
        all_pains <- all_pains[all_pains != "1"]
    if ("2" %in% all_pains & (x[, c("Facial_pain_V2")] != "1"))
        all_pains <- all_pains[all_pains != "2"]
    if ("3" %in% all_pains & (x[, c("Neck_pain_V2")] != "1"))
        all_pains <- all_pains[all_pains != "3"]
    if ("4" %in% all_pains & (x[, c("Back_pain_V2")] != "1"))
        all_pains <- all_pains[all_pains != "4"]
    if ("5" %in% all_pains & (x[, c("Stomach_pain_V2")] != "1"))
        all_pains <- all_pains[all_pains != "5"]
    if ("6" %in% all_pains & (x[, c("Hip_pain_V2")] != "1"))
        all_pains <- all_pains[all_pains != "6"]
    if ("7" %in% all_pains & (x[, c("Knee_pain_V2")] != "1"))
        all_pains <- all_pains[all_pains != "7"]
    return(length(all_pains))

}

for (i in seq(1, dim(pheno)[1])) {
    x <- pheno[i, ]
    pheno[i, "all_pains_V0"] <- pain_V0(x)
}

for (i in seq(1, dim(pheno)[1])) {
    x <- pheno[i, ]
    pheno[i, "all_pains_V1"] <- pain_V1(x)
}

for (i in seq(1, dim(pheno)[1])) {
    x <- pheno[i, ]
    pheno[i, "all_pains_V2"] <- pain_V2(x)
}

summary(as.factor(pheno$all_pains_V0))
summary(as.factor(pheno$all_pains_V1))
summary(as.factor(pheno$all_pains_V2))

# merging all into one variable
pheno$all_pains <- as.factor(ifelse(is.na(pheno$all_pains_V2),
    ifelse(is.na(pheno$all_pains_V1), pheno$all_pains_V0, pheno$all_pains_V1),
    pheno$all_pains_V2))

#compute mean GM density

arg = commandArgs(trailingOnly = TRUE)

# entrée liste des paths
library(oro.nifti)
library(scales)
list_path = arg[1]
writing_path = arg[2]
working_directory_fsl = arg[3]
working_directory_cat12 = arg[4]

ids <- read.csv(list_path, header = FALSE, sep = "")
cat12 <- c()
anat <- c()
vbm <- c()
for (im in ids$V1) {
    path_fsl <- paste(as.character(working_directory_fsl), as.character(im),
        sep = "/")
    path_cat12 <- paste(as.character(working_directory_cat12),
        as.character(im), sep = "/")
    print(paste(path_fsl, "output/vbm/T1_brain_struc_GM_to_template_GM_mod.nii.gz",
        sep = "/"))
    print(paste(path_cat12, "OutputVol/mwp1T1_brain_padded.nii.gz",
        sep = "/"))
    img_i_anat <- c(readNIfTI(paste(path_fsl, "output/vbm/T1_brain_struc_GM_to_template_GM_mod.nii.gz",
        sep = "/"), reorient = FALSE))
    img_i_vbm <- c(readNIfTI(paste(path_fsl, "output/anat/T1_brain_pve_1_struc_GM_to_template_GM_mod.nii.gz",
        sep = "/"), reorient = FALSE))
    img_i_cat12 <- c(readNIfTI(paste(path_cat12, "OutputVol/mwp1T1_brain_padded.nii.gz",
        sep = "/"), reorient = FALSE))
    cat12 <- c(cat12, mean(c(img_i_cat12)))
    anat <- c(anat, mean(c(img_i_anat)))
    vbm <- c(vbm, mean(c(img_i_vbm)))
}

write.table(data.frame(cat12), file = paste(writing_path, "GM_density_cat12",
    sep = "/"))
write.table(data.frame(anat), file = paste(writing_path, "GM_density_anat",
    sep = "/"))
write.table(data.frame(vbm), file = paste(writing_path, "GM_density_vbm",
    sep = "/"))


wd="/network/lustre/iss02/ukbiobank/ukbiobank-remaining/elise.delzant"

cd ${wd}
bind="/network/lustre/iss02/ukbiobank/software" 
${bind}/qsubshcom "source /network/lustre/iss02/ukbiobank/ukbiobank-remaining/elise.delzant/code/source_conda|;
conda activate r_env|;
wd="/network/lustre/iss02/ukbiobank/ukbiobank-remaining/elise.delzant"|;
fsl="/network/lustre/iss02/ukbiobank/ukbiobank-remaining/elise.delzant/FSLVBM"|;
cat12="/network/lustre/iss02/ukbiobank/ukbiobank-remaining/elise.delzant/CAT12"|;
Rscript /network/lustre/iss02/ukbiobank/ukbiobank-remaining/elise.delzant/code/GM_density.R Downloadfsl/participants.tsv \${wd} \${fsl} \${cat12}|;  
" 4 24G mean_GM_density 24:00:00 "--partition=bigmem"

#add GM density to pheno df

GM_dens_cat12 <- read.table("../../Filezilla_dl/GM_density_cat12",
    header = F)
GM_dens_vbm <- read.table("../../Filezilla_dl/GM_density_vbm",
    header = F)
GM_dens_anat <- read.table("../../Filezilla_dl/GM_density_anat",
    header = F)

participants <- read.table("../../Filezilla_dl/participants.tsv")

df <- data.frame(GM_dens_anat$V1, GM_dens_cat12$V1, GM_dens_vbm$V1,
    participants$V1)
df <- df[df$participants.V1 %in% pheno$ID, ]

pheno$GM_dens_Vol <- df$GM_dens_cat12.V1
pheno$GM_dens_Vbm <- df$GM_dens_vbm.V1
pheno$GM_dens_Anat <- df$GM_dens_anat.V1

#total surface area freesurfer val

pheno$Surf_left_hemi <- as.numeric(pheno$Surf_left_hemi)
pheno$Surf_right_hemi <- as.numeric(pheno$Surf_right_hemi)
pheno$Total_surface_area <- pheno$Surf_left_hemi + pheno$Surf_right_hemi

#as factors

pheno$Sexe <- as.factor(pheno$Sexe)
levels(pheno$Sexe) <- c("Female", "Male")
pheno$Center <- as.factor(pheno$Center)
pheno$T1_nifti <- as.factor(pheno$T1_nifti)
pheno$T1_FreeSurfer <- as.factor(pheno$T1_FreeSurfer)
pheno[which(pheno$Cannabis == "-818"), "Cannabis"] <- NA
pheno$Cannabis <- as.factor(pheno$Cannabis)
levels(pheno$Cannabis) <- c("No", "1-2t.", "3-10t.", "11-100t.",
    "100+")
pheno[which(pheno$Ongoing_addiction == "-818"), "Ongoing_addiction"] <- NA
levels(pheno$Ongoing_addiction) <- c("No", "Yes")
pheno$Ongoing_addiction <- as.factor(pheno$Ongoing_addiction)
pheno$First_psychotic_age <- as.factor(pheno$First_psychotic_age)
pheno$Mean_rfmri <- as.numeric(pheno$Mean_rfmri)
pheno$X_pos <- as.numeric(pheno$X_pos)
pheno$Y_pos <- as.numeric(pheno$Y_pos)
pheno$Z_pos <- as.numeric(pheno$Z_pos)
pheno$IMD <- as.numeric(pheno$IMD)
pheno[which(pheno$Restlessness == "-818"), "Restlessness"] <- NA
pheno$Restlessness <- as.factor(pheno$Restlessness)
levels(pheno$Restlessness) <- c("Not at all", "Several days",
    "More than half the days", "Nearly every day")

# Date_centerV2 create gap between first MRI made from
# center and one's MRI
pheno$Date_center <- as.Date(pheno$Date_center)

pheno_c1 <- pheno[pheno$Center == 11025, ]
pheno_c2 <- pheno[pheno$Center == 11026, ]
pheno_c3 <- pheno[pheno$Center == 11027, ]
pheno_c4 <- pheno[pheno$Center == 11028, ]

min1 <- min(pheno_c1$Date_center)
min2 <- min(pheno_c2$Date_center)
min3 <- min(pheno_c3$Date_center)
min4 <- min(pheno_c4$Date_center)

pheno_c1$Diff_to_mri <- as.numeric(pheno_c1$Date_center - min1)
pheno_c2$Diff_to_mri <- as.numeric(pheno_c2$Date_center - min2)
pheno_c3$Diff_to_mri <- as.numeric(pheno_c3$Date_center - min3)
pheno_c4$Diff_to_mri <- as.numeric(pheno_c4$Date_center - min4)

pheno$Diff_to_mri <- NA
pheno[which(pheno$Center == 11025), "Diff_to_mri"] <- as.numeric(pheno_c1$Date_center -
    min1)
pheno[which(pheno$Center == 11026), "Diff_to_mri"] <- as.numeric(pheno_c2$Date_center -
    min2)
pheno[which(pheno$Center == 11027), "Diff_to_mri"] <- as.numeric(pheno_c3$Date_center -
    min3)
pheno[which(pheno$Center == 11028), "Diff_to_mri"] <- as.numeric(pheno_c4$Date_center -
    min4)

pheno$Diff_to_mri <- pheno$Diff_to_mri/365.25

save(pheno, file = "pheno_plots.RData")

#keeping only final variables

# center categories
cent <- c("ID", "Center")

# individual characteristics
indiv <- c("Sexe", "Assessment_center_age", "BMI", "Education_age",
    "Maternal_smoking", "IMD", "Sex_first_age", "Nb_children_both",
    "Brain_volume", "Waist_circumf", "Hip_circumf", "TBV")

# Addiction
addict <- c("smoking", "Alcohol_freq")

# Disease
disease <- c("Sleeplessness", "Diabetes", "High_blood_pressure",
    "Stroke", "Alzheimer", "Parkinson", "Tinnitus", "Restlessness",
    "all_pains")

# Depression and bipolarity
dep <- "Depression"

# T1 related variables
t1 <- c("X_pos", "Y_pos", "Diff_to_mri", "Mean_rfmri", "Discrepancy_template",
    "Discrepancy_template_non_lin", "Scaling_T1", "SNR", "GM_dens_Vol",
    "GM_dens_Anat", "GM_dens_Vbm", "Total_surface_area")

# Cognition
cog <- "Cog"

pheno_lmm <- pheno[, c(cent, indiv, addict, disease, dep, t1,
    cog)]

levels(pheno_lmm$Sexe) <- c(0, 1)
levels(pheno_lmm$Maternal_smoking) <- c(0, 1)
levels(pheno_lmm$smoking) <- c(0, 1, 2, 3, 4, 5, 6)
levels(pheno_lmm$Alcohol_freq) <- c(0, 1, 2, 3, 4)
levels(pheno_lmm$Sleeplessness) <- c(0, 1, 2)
levels(pheno_lmm$Restlessness) <- c(0, 1, 2, 3)
levels(pheno_lmm$Diabetes) <- c(0, 1)
levels(pheno_lmm$Parkinson) <- c(0, 1)
levels(pheno_lmm$Stroke) <- c(0, 1)
levels(pheno_lmm$High_blood_pressure) <- c(0, 1)
levels(pheno_lmm$Tinnitus) <- c(0, 1, 2, 3, 4)
levels(pheno_lmm$Depression) <- c(4, 3, 2, 1)
levels(pheno_lmm$Alzheimer) <- c(0, 1)
pheno_lmm$Nb_children_both <- as.numeric(pheno_lmm$Nb_children_both)
save(pheno_lmm, file = "pheno_lmm.RData")
write.table(colnames(pheno_lmm), "../phenotypes_list", row.names = F,
    col.names = F)

#divide into main and replicate

load("pheno_lmm.RData")

main <- pheno_lmm[which(pheno_lmm$Center == 11025), ]
replicate <- pheno_lmm[which(pheno_lmm$Center %in% c(11026, 11027,
    11028)), ]
save(main, file = "../Data/main.Rdata")
save(replicate, file = "../Data/replicate.RData")
write.table(as.data.frame(main$ID), file = "/Users/elise.delzant/Documents/01GM_Association/main_id.csv",
    row.names = FALSE, col.names = FALSE)
write.table(as.data.frame(replicate$ID), file = "/Users/elise.delzant/Documents/01GM_Association/replicate_id.csv",
    row.names = FALSE, col.names = FALSE)

A work by by Elise Delzant

Phenotype for LMM

by Elise Delzant - 31 July 2025