# DataTransformationCRH/contact_script.R


								library(readxl)

								library(dplyr)

								library(lubridate)

								library(DT)


								mutlstxlrdr <- function() {

								  for (i in seq_along(sheet.na)) {

								    colnames <-

								      unique(saptemplate[saptemplate$`Sheet Name` == snames[i], ]$Header)

								    df <- read.table("", col.names = colnames)

								    assign(snames[i], df)

								  }

								}

								filenames <- list.files("./contacts/CodeList", pattern = "*.xlsx", full.names = T) # We can avoid creating a separate directory for code list. But organizing may be difficult. However, this can be explored further if we want transform all the data in one go i.e. not by functions (contacts, accounts etc.).

								# File paths

								sheet_names <- lapply(filenames, excel_sheets) # Creates a list of the sheet names

								for (i in seq_along(filenames)) {

								  codelist_files <-

								    lapply(excel_sheets(filenames[[i]]), read_excel, path = filenames[[i]]) # Reads the sheets of the excel files

								  names(codelist_files) <-

								    c(sheet_names[[i]]) # Renames them according to the sheet names extracted above

								}

								# Names of the files imported

								oldfilepath <- ("./contacts/olddummy.xlsx")

								old.data <-

								  lapply(excel_sheets(oldfilepath), read_excel, path = oldfilepath)

								names(old.data) <- excel_sheets(oldfilepath)

								# Names of the files imported

								names(old.data)

								saptemplate <-

								  read_excel("./contacts/Contact.xlsx", sheet = "Field_Definitions")

								snames <- unique(saptemplate$`Sheet Name`)

								# Creates data frame for each sheet in snames

								for (i in seq_along(snames)) {

								  colnames <-

								    saptemplate[saptemplate$`Sheet Name` == snames[i], ]$Header # Defines the column names

								  df <- read.table("", col.names = colnames) # Creates an empty data frame using the column names

								  assign(snames[i], df) # Assigns value of df to a data frame named in sname

								}

								old.copy <- old.data$Contact_o # Selecting only one table as sample

								mapped <- read.csv("./contacts/contact_map.csv", sep = ";")

								x = NULL

								for (i in 1:nrow(mapped)) {

								  x[i] = mapped[mapped$oldkey == colnames(old.copy[i]), ]$Header

								}

								colnames(old.copy) <- x # Changing column names

								saptemplate[saptemplate$`Sheet Name` == "Contact", ] |>

								  filter(Mandatory == "Yes") |>

								  pull(Header) -> essential.rows # List of mandatory columns

								essen.rows.table = read.table("", col.names = c("Item", "Missing"))

								for (i in seq_along(essential.rows)) {

								  essen.rows.table[i, 2] <- sum(is.na(old.copy[, essential.rows[i]]))

								  essen.rows.table[i, 1] <- essential.rows[i]

								} # Creates the table below

								for (i in seq_along(essential.rows)) {

								  old.copy <- old.copy[!is.na(old.copy[, essential.rows[i]]), ]

								} # Remove the rows with missing mandatory values

								codelistcols <- saptemplate[saptemplate$`Sheet Name` == "Contact", ] |>

								  filter(!is.na(`CodeList File Path`)) |> pull(Header) # List of columns that have a codelist

								codelisted.rows.table = read.table("", col.names = c("Item", "Missing", "Not_from_code"))

								for (i in seq_along(codelistcols)) {

								  codelisted.rows.table[i, 3] <-

								    sum(!pull(old.copy[, codelistcols[i]], 1) %in% c(pull(codelist_files[codelistcols[i]][[1]], Description), NA)) # Added NA else empty columns also get counted

								  codelisted.rows.table[i, 2] <-

								    sum(is.na(old.copy[, codelistcols[i]]))

								  codelisted.rows.table[i, 1] <- codelistcols[i]

								} # Creates the table below

								codelisted.rows.table

								for (i in seq_along(codelistcols)) {

								  old.copy[!pull(old.copy[, codelistcols[i]], 1) %in% c(pull(codelist_files[codelistcols[i]][[1]], Description), NA), codelistcols[i]] <-

								    NA

								} # Removes the value in case of mismatch

								for (i in seq_along(codelistcols)) {

								  old.copy[, codelistcols[i]] <-

								    pull(codelist_files[codelistcols[i]][[1]], 2)[match(pull(old.copy, codelistcols[i]),

								                                                        pull(codelist_files[codelistcols[i]][[1]], Description))]

								} # Matches each column with the corresponding code list and returns the value

								dtype <-

								  saptemplate[saptemplate$`Sheet Name` == "Contact", ]$`Data Type` # List of data types. Non Exhaustive ATM

								for (i in 1:ncol(old.copy)) {

								  if (dtype[i] == "String") {old.copy[, i] <- as.character(pull(old.copy, i))}

								  if (dtype[i] == "Boolean") {old.copy[, i] <- as.logical(pull(old.copy, i))}

								  if (dtype[i] == "DateTime") {old.copy[, i] <- lubridate::ymd_hms(pull(old.copy, i))}

								  if (dtype[i] == "Time") { old.copy[, i] <- lubridate::hms(pull(old.copy, i))} # This list will increase and also change based on input date and time formats

								}

								max.length <-

								  saptemplate[saptemplate$`Sheet Name` == "Contact", ]$`Max Length` # List of max lengths mentioned


								colclasses <- lapply(old.copy, class) # getting column classes


								for (i in 1:ncol(old.copy)) {

								  if (colclasses[[i]] == "character") {

								    old.copy[, i] <-

								      ifelse(nchar(pull(old.copy, i)) > max.length[i],

								             substring(pull(old.copy, i), 1, max.length[i]),

								             pull(old.copy, i))

								  } # If string length is more than mentioned, trim it to the mentioned


								}

								write.csv(old.copy, "Contact.csv", row.names = FALSE)