2 Read Cohort B Data

1 Read Data

We read the file data_to_harmonise_age_issue.csv using vroom::vroom

Code

# Read the first two columns (ID and Age) of the Cohort B csv file using
# explicit column types, then standardise the ID column name, drop rows
# without an ID and validate that the IDs are unique.
cohort_B_data <- vroom::vroom(
  file = here::here(
    "data-raw",
    "Cohort_B",
    "data_to_harmonise_age_issue.csv"
  ),
  delim = ",",
  col_select = 1:2,
  show_col_types = FALSE,
  col_types = list(
    ID = vroom::col_character(),
    Age = vroom::col_integer()
  )
) |>
  dplyr::rename(cohort_unique_id = "ID") |>
  # Remove rows when the ID value is NA
  dplyr::filter(!is.na(.data[["cohort_unique_id"]])) |>
  # Remove white spaces in column names
  # (rename_with() is the current replacement for the superseded rename_all())
  dplyr::rename_with(stringr::str_trim) |>
  # Check if cohort id is unique
  pointblank::rows_distinct(columns = "cohort_unique_id")

To safeguard against a csv file with issues, we can use the function vroom::problems.

If there are issues with the data, the output of vroom::problems will be a tibble with one row per issue; if there are none, the tibble has zero rows.

Code

# Show the parsing problems vroom recorded for this table
vroom::problems(cohort_B_data)

# A tibble: 3 × 5
    row   col expected   actual  file                                           
  <int> <int> <chr>      <chr>   <chr>                                          
1     4     2 an integer missing D:/Jeremy/PortableR/RPortableWorkDirectory/har…
2    10     2 an integer missing D:/Jeremy/PortableR/RPortableWorkDirectory/har…
3    17     2 an integer missing D:/Jeremy/PortableR/RPortableWorkDirectory/har…

To check for this automatically, we can use pointblank::expect_row_count_match

Code

# Expect the table of parsing problems to have zero rows
pointblank::expect_row_count_match(
  vroom::problems(cohort_B_data),
  count = 0
)

Error: Row counts for the two tables did not match.
The `expect_row_count_match()` validation failed beyond the absolute threshold level (1).
* failure level (1) >= failure threshold (1)

Suppose we have a csv file with no issues. We can safeguard it with the following code.

Code

# Read the first eight columns of the Cohort B csv file using explicit
# column types, then standardise the ID column name, drop rows without an
# ID and validate that the IDs are unique.
cohort_B_data <- vroom::vroom(
  file = here::here(
    "data-raw",
    "Cohort_B",
    "data_to_harmonise.csv"
  ),
  delim = ",",
  col_select = 1:8,
  show_col_types = FALSE,
  col_types = list(
    ID = vroom::col_character(),
    Age = vroom::col_integer(),
    Sex = vroom::col_character(),
    Height = vroom::col_double(),
    Weight = vroom::col_double(),
    `Smoke History` = vroom::col_character(),
    `Chest Pain Character` = vroom::col_character(),
    Dyspnea = vroom::col_character()
  )
) |>
  dplyr::rename(cohort_unique_id = "ID") |>
  # Remove rows when the ID value is NA
  dplyr::filter(!is.na(.data[["cohort_unique_id"]])) |>
  # Remove white spaces in column names
  # (rename_with() is the current replacement for the superseded rename_all())
  dplyr::rename_with(stringr::str_trim) |>
  # Check if cohort id is unique
  pointblank::rows_distinct(columns = "cohort_unique_id")

# Expect the table of parsing problems to have zero rows
pointblank::expect_row_count_match(
  vroom::problems(cohort_B_data),
  count = 0
)

2 Write Preprocessed File

We output data to be used for the next session.

Code

# Save the cleaned Cohort B table as a parquet file in the preprocessing
# folder given by the document parameters
nanoparquet::write_parquet(
  x = cohort_B_data,
  file = here::here(
    params$analysis_folder,
    params$harmonisation_folder,
    params$preprocessing_folder,
    "01_Cohort_B_cleaned.parquet"
  )
)