| Sex | sex |
|---|---|
| Female | 0 |
| Male | 1 |
3 Extract Demographic
1 Read Preprocessed File
We read output data from the previous section.
2 Demographics and Behavioral parameters
2.1 Age and Sex
age_years will be mapped from the column Age.
sex is grouped as follows:
Code
# Harmonise age and sex for cohort B: `age_years` is taken from `Age` and
# `sex` is coded "0" (Female) / "1" (Male). Both results are validated.
age_gender_data <- cohort_B_data |>
  dplyr::select(c("cohort_unique_id", "Age", "Sex")) |>
  # Check the Age column of this cohort (cohort B) with
  # harmonisation::is_integer_vector(), allowing NA, before it is coerced
  # with as.integer() below.
  pointblank::col_vals_expr(
    expr = ~ harmonisation::is_integer_vector(
      cohort_B_data[["Age"]],
      allow_na = TRUE
    )
  ) |>
  dplyr::mutate(
    # Convert age to type integer
    age_years = as.integer(.data[["Age"]]),
    # Recode sex; labels other than "Female"/"Male" become NA
    sex = dplyr::case_when(
      .data[["Sex"]] == "Female" ~ "0",
      .data[["Sex"]] == "Male" ~ "1",
      .default = NA_character_
    ),
    # Convert categorical columns to factors with a fixed level order
    `Sex` = forcats::fct_relevel(
      .data[["Sex"]],
      c("Female", "Male")
    ),
    sex = forcats::fct_relevel(
      .data[["sex"]],
      c("0", "1")
    )
  ) |>
  # Place each harmonised column right after its source column
  dplyr::relocate(
    "sex",
    .after = "Sex"
  ) |>
  dplyr::relocate(
    "age_years",
    .after = "Age"
  ) |>
  pointblank::col_vals_in_set(
    columns = "sex",
    set = c("0", "1")
  ) |>
  # Age must lie in (0, 100]; missing ages are allowed
  pointblank::col_vals_between(
    columns = "age_years",
    left = 0,
    right = 100,
    inclusive = c(FALSE, TRUE),
    na_pass = TRUE
  )
Code
# Show the interactive table only when requested and rendering to HTML.
if (params$show_table && knitr::is_html_output()) {
  harmonisation::reactable_with_download_csv_button(age_gender_data)
}
Remove unnecessary columns so that we can merge with the other fields.
Code
# Keep only the key and the harmonised columns.
age_gender_data <- dplyr::select(age_gender_data, -c("Age", "Sex"))
2.2 Height, Weight, BMI and BSA
height_cm will be mapped from the column Height. weight_kg will be mapped from the column Weight.
bsa_m2 in $m^2$ will be calculated as $\sqrt{\frac{\text{Height (cm)} \times \text{Weight (kg)}}{3600}}$. bmi will be calculated as $\frac{\text{Weight (kg)}}{(\text{Height (m)})^2}$.
All values are then converted to two decimal places.
Code
# Derive body measurements for cohort B. BSA and BMI are computed from the
# unrounded height and weight; every measurement is then rounded to 2 digits.
body_measurement_data <- cohort_B_data |>
  dplyr::select(c("cohort_unique_id", "Height", "Weight")) |>
  dplyr::mutate(
    height_cm = .data[["Height"]],
    weight_kg = .data[["Weight"]],
    # BSA: sqrt(height_cm * weight_kg / 3600)
    bsa_m2 = harmonisation::round_to_nearest_digit(
      sqrt((.data[["height_cm"]] * .data[["weight_kg"]]) / 3600),
      digits = 2
    ),
    # BMI: weight_kg / (height in metres)^2
    bmi = harmonisation::round_to_nearest_digit(
      .data[["weight_kg"]] / ((.data[["height_cm"]] / 100)^2),
      digits = 2
    ),
    # Round the raw measurements last so the derived values above are
    # based on the unrounded inputs.
    height_cm = harmonisation::round_to_nearest_digit(
      .data[["height_cm"]],
      digits = 2
    ),
    weight_kg = harmonisation::round_to_nearest_digit(
      .data[["weight_kg"]],
      digits = 2
    )
  ) |>
  # BMI must lie in [10, 50]; missing values are allowed
  pointblank::col_vals_between(
    columns = "bmi",
    left = 10,
    right = 50,
    inclusive = c(TRUE, TRUE),
    na_pass = TRUE
  )
Code
# Show the interactive table only when requested and rendering to HTML.
if (params$show_table && knitr::is_html_output()) {
  harmonisation::reactable_with_download_csv_button(body_measurement_data)
}
Remove unnecessary columns so that we can merge with the other fields.
Code
# Keep only the key and the harmonised columns.
body_measurement_data <- dplyr::select(
  body_measurement_data,
  -c("Height", "Weight")
)
2.3 Smoking History
smoke_current is grouped as follows:
| Smoke History | smoke_current |
|---|---|
| non-smoker | 0 |
| past smoker | 0 |
| current smoker | 1 |
| NA | -1 |
smoke_past is grouped as follows:
| Smoke History | smoke_past |
|---|---|
| non-smoker | 0 |
| past smoker | 1 |
| current smoker | 0 |
| NA | -1 |
We do a check to ensure that we can only have these scenarios:

- `smoke_current` as 1 and `smoke_past` as 0 for current smokers
- `smoke_current` as 0 and `smoke_past` as 1 for past smokers
- `smoke_current` as 0 and `smoke_past` as 0 for non-smokers
- `smoke_current` as -1 and `smoke_past` as -1 for unknown
Code
# Harmonise smoking history for cohort B into two indicator columns,
# `smoke_current` and `smoke_past`, each coded "0", "1" or "-1" (missing
# history). The allowed combinations are validated at the end.
smoking_data <- cohort_B_data |>
  dplyr::select(c("cohort_unique_id", "Smoke History")) |>
  dplyr::mutate(
    smoke_current = dplyr::case_when(
      is.na(.data[["Smoke History"]]) ~ "-1",
      .data[["Smoke History"]] == "non-smoker" ~ "0",
      .data[["Smoke History"]] == "past smoker" ~ "0",
      .data[["Smoke History"]] == "current smoker" ~ "1",
      .default = NA_character_
    ),
    smoke_current = forcats::fct_relevel(
      .data[["smoke_current"]],
      c("0", "1")
    ),
    smoke_past = dplyr::case_when(
      is.na(.data[["Smoke History"]]) ~ "-1",
      .data[["Smoke History"]] == "non-smoker" ~ "0",
      .data[["Smoke History"]] == "past smoker" ~ "1",
      .data[["Smoke History"]] == "current smoker" ~ "0",
      .default = NA_character_
    ),
    smoke_past = forcats::fct_relevel(
      .data[["smoke_past"]],
      c("0", "1")
    ),
    `Smoke History` = forcats::fct(
      .data[["Smoke History"]]
    )
  ) |>
  pointblank::col_vals_in_set(
    columns = c("smoke_current", "smoke_past"),
    set = c("0", "1", "-1")
  ) |>
  # Only these combinations are valid:
  #   current smoker: smoke_current "1",  smoke_past "0"
  #   unknown:        smoke_current "-1", smoke_past "-1"
  #   past/non-smoker: smoke_current "0", smoke_past "1" or "0"
  # The unknown case compares against the string "-1"; a unary minus on
  # a string (-"1") is an error in R.
  pointblank::col_vals_expr(
    expr = pointblank::expr(
      (.data[["smoke_current"]] == "1" & .data[["smoke_past"]] == "0") |
        (.data[["smoke_current"]] == "-1" & .data[["smoke_past"]] == "-1") |
        (.data[["smoke_current"]] == "0" & .data[["smoke_past"]] %in% c("0", "1"))
    )
  )
Code
# Show the interactive table only when requested and rendering to HTML.
if (params$show_table && knitr::is_html_output()) {
  harmonisation::reactable_with_download_csv_button(smoking_data)
}
Remove unnecessary columns so that we can merge with the other fields.
Code
# Keep only the key and the harmonised columns.
smoking_data <- dplyr::select(smoking_data, -c("Smoke History"))
2.4 Chest Pain
2.4.1 Shortness of Breath
have_sob is grouped as follows:
| Dyspnea | have_sob |
|---|---|
| no | 0 |
| yes | 1 |
Code
# Harmonise `Dyspnea` for cohort B into `have_sob`:
# "no" -> "0", "yes" -> "1", anything else -> NA.
shortness_of_breath_data <- cohort_B_data |>
  dplyr::select(c("cohort_unique_id", "Dyspnea")) |>
  dplyr::mutate(
    have_sob = dplyr::case_when(
      .data[["Dyspnea"]] == "no" ~ "0",
      .data[["Dyspnea"]] == "yes" ~ "1",
      .default = NA_character_
    ),
    # Convert both columns to factors with a fixed level order
    have_sob = forcats::fct_relevel(
      as.character(.data[["have_sob"]]),
      c("0", "1")
    ),
    Dyspnea = forcats::fct_relevel(
      as.character(.data[["Dyspnea"]]),
      c("no", "yes")
    )
  ) |>
  pointblank::col_vals_in_set(
    columns = c("have_sob"),
    set = c("0", "1", "-1")
  )
Code
# Show the interactive table only when requested and rendering to HTML.
if (params$show_table && knitr::is_html_output()) {
  harmonisation::reactable_with_download_csv_button(shortness_of_breath_data)
}
Remove unnecessary columns so that we can merge with the other fields.
Code
# Keep only the key and the harmonised column.
shortness_of_breath_data <- dplyr::select(
  shortness_of_breath_data,
  -c("Dyspnea")
)
2.4.2 Have chest pain or not
have_chest_pain is grouped as follows:
| Chest Pain Character | have_chest_pain |
|---|---|
| no chest pain | 0 |
| typical, atypical or nonanginal | 1 |
Code
# Harmonise `Chest Pain Character` for cohort B into `have_chest_pain`:
# "no chest pain" -> "0"; "typical", "atypical" or "nonanginal" -> "1";
# anything else -> NA.
have_chest_pain_data <- cohort_B_data |>
  dplyr::select(c("cohort_unique_id", "Chest Pain Character")) |>
  dplyr::mutate(
    have_chest_pain = dplyr::case_when(
      .data[["Chest Pain Character"]] %in% c("no chest pain") ~ "0",
      .data[["Chest Pain Character"]] %in%
        c("typical", "atypical", "nonanginal") ~ "1",
      .default = NA_character_
    ),
    # Convert both columns to factors with a fixed level order
    have_chest_pain = forcats::fct_relevel(
      .data[["have_chest_pain"]],
      c("0", "1")
    ),
    `Chest Pain Character` = forcats::fct_relevel(
      as.character(.data[["Chest Pain Character"]]),
      c("no chest pain", "typical", "atypical", "nonanginal")
    )
  ) |>
  pointblank::col_vals_in_set(
    columns = c("have_chest_pain"),
    set = c("0", "1")
  )
Code
# Show the interactive table only when requested and rendering to HTML.
if (params$show_table && knitr::is_html_output()) {
  harmonisation::reactable_with_download_csv_button(have_chest_pain_data)
}
Remove unnecessary columns so that we can merge with the other fields.
Code
# Keep only the key and the harmonised column.
have_chest_pain_data <- dplyr::select(
  have_chest_pain_data,
  -c("Chest Pain Character")
)
2.4.3 Symptomatic or Asymptomatic
symptoms is grouped as follows:
| have_sob | have_chest_pain | symptoms |
|---|---|---|
| -1 | -1 | -1 |
| 0 | 0 | 0 |
| 0 or 1 | 1 | 1 |
| 1 | 0 | 2 |
Code
# Derive `symptoms` from `have_sob` and `have_chest_pain`:
#   both "-1"                                -> "-1"
#   both "0"                                 -> "0"
#   have_chest_pain "1", have_sob "0" or "1" -> "1"
#   have_chest_pain "0", have_sob "1"        -> "2"
#   anything else                            -> NA
symptoms_data <- cohort_B_data |>
  dplyr::select(c("cohort_unique_id")) |>
  # Both joins must match every row exactly once, otherwise they error
  dplyr::inner_join(
    shortness_of_breath_data,
    by = dplyr::join_by("cohort_unique_id"),
    unmatched = "error",
    relationship = "one-to-one"
  ) |>
  dplyr::inner_join(
    have_chest_pain_data,
    by = dplyr::join_by("cohort_unique_id"),
    unmatched = "error",
    relationship = "one-to-one"
  ) |>
  dplyr::mutate(
    symptoms = dplyr::case_when(
      .data[["have_chest_pain"]] == "-1" & .data[["have_sob"]] == "-1" ~ "-1",
      .data[["have_chest_pain"]] == "0" & .data[["have_sob"]] == "0" ~ "0",
      .data[["have_chest_pain"]] == "1" &
        .data[["have_sob"]] %in% c("0", "1") ~ "1",
      .data[["have_chest_pain"]] == "0" & .data[["have_sob"]] == "1" ~ "2",
      .default = NA_character_
    ),
    symptoms = forcats::fct_relevel(
      .data[["symptoms"]],
      c("0", "1", "2")
    )
  ) |>
  pointblank::col_vals_in_set(
    columns = c("symptoms"),
    set = c("0", "1", "2")
  )
Code
# Show the interactive table only when requested and rendering to HTML.
if (params$show_table && knitr::is_html_output()) {
  harmonisation::reactable_with_download_csv_button(symptoms_data)
}
Remove unnecessary columns so that we can merge with the other fields.
Code
# Keep only the key and the derived `symptoms` column.
symptoms_data <- dplyr::select(
  symptoms_data,
  -c("have_chest_pain", "have_sob")
)
2.4.4 Chest Pain Type
chest_pain_type is grouped as follows:
| Dyspnea | Chest Pain Character | chest_pain_type |
|---|---|---|
| no | no chest pain | 0 |
| no or yes | typical | 1 |
| no or yes | atypical | 2 |
| no or yes | nonanginal | 3 |
| yes | no chest pain | 4 |
Code
# Derive `chest_pain_type` from `Chest Pain Character` and `Dyspnea`:
#   "no chest pain" with Dyspnea "no"       -> "0"
#   "typical" with Dyspnea "no" or "yes"    -> "1"
#   "atypical" with Dyspnea "no" or "yes"   -> "2"
#   "nonanginal" with Dyspnea "no" or "yes" -> "3"
#   "no chest pain" with Dyspnea "yes"      -> "4"
#   anything else                           -> NA
chest_pain_type_data <- cohort_B_data |>
  dplyr::select(c("cohort_unique_id", "Chest Pain Character", "Dyspnea")) |>
  dplyr::mutate(
    chest_pain_type = dplyr::case_when(
      .data[["Chest Pain Character"]] == "no chest pain" &
        .data[["Dyspnea"]] == "no" ~ "0",
      .data[["Chest Pain Character"]] == "typical" &
        .data[["Dyspnea"]] %in% c("no", "yes") ~ "1",
      .data[["Chest Pain Character"]] == "atypical" &
        .data[["Dyspnea"]] %in% c("no", "yes") ~ "2",
      .data[["Chest Pain Character"]] == "nonanginal" &
        .data[["Dyspnea"]] %in% c("no", "yes") ~ "3",
      .data[["Chest Pain Character"]] == "no chest pain" &
        .data[["Dyspnea"]] == "yes" ~ "4",
      .default = NA_character_
    ),
    # Convert the source and derived columns to factors with a fixed
    # level order
    `Chest Pain Character` = forcats::fct_relevel(
      as.character(.data[["Chest Pain Character"]]),
      c("no chest pain", "typical", "atypical", "nonanginal")
    ),
    `Dyspnea` = forcats::fct_relevel(
      as.character(.data[["Dyspnea"]]),
      c("no", "yes")
    ),
    chest_pain_type = forcats::fct_relevel(
      .data[["chest_pain_type"]],
      c("0", "1", "2", "3")
    )
  ) |>
  dplyr::relocate(
    "Chest Pain Character",
    .after = "cohort_unique_id"
  ) |>
  pointblank::col_vals_in_set(
    columns = c("chest_pain_type"),
    set = c("0", "1", "2", "3", "4")
  )
Code
# Show the interactive table only when requested and rendering to HTML.
if (params$show_table && knitr::is_html_output()) {
  harmonisation::reactable_with_download_csv_button(chest_pain_type_data)
}
Remove unnecessary columns so that we can merge with the other fields.
Code
# Keep only the key and the derived `chest_pain_type` column.
chest_pain_type_data <- dplyr::select(
  chest_pain_type_data,
  -c("Dyspnea", "Chest Pain Character")
)
2.5 Combine Demographics
We combine all the data to give the demo_behave_data.
Code
# Combine every harmonised demographic/behavioural table into
# `demo_behave_data`, keyed by `cohort_unique_id`. Each join must match
# every row exactly once, otherwise it errors.
join_specification <- dplyr::join_by("cohort_unique_id")
demo_behave_data <- cohort_B_data |>
  dplyr::select(c("cohort_unique_id")) |>
  dplyr::inner_join(
    age_gender_data,
    by = join_specification,
    unmatched = "error",
    relationship = "one-to-one"
  ) |>
  dplyr::inner_join(
    body_measurement_data,
    by = join_specification,
    unmatched = "error",
    relationship = "one-to-one"
  ) |>
  dplyr::inner_join(
    smoking_data,
    by = join_specification,
    unmatched = "error",
    relationship = "one-to-one"
  ) |>
  # The chest pain fields are held in four separate tables, each with the
  # key plus one harmonised column; join them individually rather than
  # through a single `chest_pain_data` object, which is not built in the
  # sections above.
  dplyr::inner_join(
    shortness_of_breath_data,
    by = join_specification,
    unmatched = "error",
    relationship = "one-to-one"
  ) |>
  dplyr::inner_join(
    have_chest_pain_data,
    by = join_specification,
    unmatched = "error",
    relationship = "one-to-one"
  ) |>
  dplyr::inner_join(
    symptoms_data,
    by = join_specification,
    unmatched = "error",
    relationship = "one-to-one"
  ) |>
  dplyr::inner_join(
    chest_pain_type_data,
    by = join_specification,
    unmatched = "error",
    relationship = "one-to-one"
  ) |>
  dplyr::relocate(
    c("bsa_m2", "bmi"),
    .after = "sex"
  )

# All harmonised columns must be present ...
testthat::expect_true(
  pointblank::has_columns(
    demo_behave_data,
    columns = c(
      "age_years", "sex",
      "height_cm", "weight_kg", "bsa_m2", "bmi",
      "smoke_current", "smoke_past",
      "have_sob", "have_chest_pain",
      "symptoms", "chest_pain_type"
    )
  )
)

# ... and nothing else: the 12 columns above plus `cohort_unique_id`.
testthat::expect_equal(
  ncol(demo_behave_data), 13
)
3 Write Preprocessed File
We output data to be used for the next section.
Code
# Write the combined demographic data to a parquet file under the
# preprocessing folder given by the document parameters.
nanoparquet::write_parquet(
  demo_behave_data,
  file = here::here(
    params$analysis_folder,
    params$harmonisation_folder,
    params$preprocessing_folder,
    "02_demographic_data.parquet"
  )
)