You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
100 lines
4.8 KiB
100 lines
4.8 KiB
library(readxl)
|
|
library(dplyr)
|
|
library(lubridate)
|
|
library(DT)
|
|
|
|
mutlstxlrdr <- function() {
|
|
for (i in seq_along(sheet.na)) {
|
|
colnames <-
|
|
unique(saptemplate[saptemplate$`Sheet Name` == snames[i], ]$Header)
|
|
df <- read.table("", col.names = colnames)
|
|
assign(snames[i], df)
|
|
}
|
|
}
|
|
filenames <- list.files("./contacts/CodeList", pattern = "*.xlsx", full.names = T) # We can avoid creating a separate directory for code list. But organizing may be difficult. However, this can be explored further if we want transform all the data in one go i.e. not by functions (contacts, accounts etc.).
|
|
# File paths
|
|
sheet_names <- lapply(filenames, excel_sheets) # Creates a list of the sheet names
|
|
for (i in seq_along(filenames)) {
|
|
codelist_files <-
|
|
lapply(excel_sheets(filenames[[i]]), read_excel, path = filenames[[i]]) # Reads the sheets of the excel files
|
|
names(codelist_files) <-
|
|
c(sheet_names[[i]]) # Renames them according to the sheet names extracted above
|
|
}
|
|
# Names of the files imported
|
|
oldfilepath <- ("./contacts/olddummy.xlsx")
|
|
old.data <-
|
|
lapply(excel_sheets(oldfilepath), read_excel, path = oldfilepath)
|
|
names(old.data) <- excel_sheets(oldfilepath)
|
|
# Names of the files imported
|
|
names(old.data)
|
|
saptemplate <-
|
|
read_excel("./contacts/Contact.xlsx", sheet = "Field_Definitions")
|
|
snames <- unique(saptemplate$`Sheet Name`)
|
|
# Creates data frame for each sheet in snames
|
|
for (i in seq_along(snames)) {
|
|
colnames <-
|
|
saptemplate[saptemplate$`Sheet Name` == snames[i], ]$Header # Defines the column names
|
|
df <- read.table("", col.names = colnames) # Creates an empty data frame using the column names
|
|
assign(snames[i], df) # Assigns value of df to a data frame named in sname
|
|
}
|
|
old.copy <- old.data$Contact_o # Selecting only one table as sample
|
|
mapped <- read.csv("./contacts/contact_map.csv", sep = ";")
|
|
x = NULL
|
|
for (i in 1:nrow(mapped)) {
|
|
x[i] = mapped[mapped$oldkey == colnames(old.copy[i]), ]$Header
|
|
}
|
|
colnames(old.copy) <- x # Changing column names
|
|
saptemplate[saptemplate$`Sheet Name` == "Contact", ] |>
|
|
filter(Mandatory == "Yes") |>
|
|
pull(Header) -> essential.rows # List of mandatory columns
|
|
essen.rows.table = read.table("", col.names = c("Item", "Missing"))
|
|
for (i in seq_along(essential.rows)) {
|
|
essen.rows.table[i, 2] <- sum(is.na(old.copy[, essential.rows[i]]))
|
|
essen.rows.table[i, 1] <- essential.rows[i]
|
|
} # Creates the table below
|
|
for (i in seq_along(essential.rows)) {
|
|
old.copy <- old.copy[!is.na(old.copy[, essential.rows[i]]), ]
|
|
} # Remove the rows with missing mandatory values
|
|
codelistcols <- saptemplate[saptemplate$`Sheet Name` == "Contact", ] |>
|
|
filter(!is.na(`CodeList File Path`)) |> pull(Header) # List of columns that have a codelist
|
|
codelisted.rows.table = read.table("", col.names = c("Item", "Missing", "Not_from_code"))
|
|
for (i in seq_along(codelistcols)) {
|
|
codelisted.rows.table[i, 3] <-
|
|
sum(!pull(old.copy[, codelistcols[i]], 1) %in% c(pull(codelist_files[codelistcols[i]][[1]], Description), NA)) # Added NA else empty columns also get counted
|
|
codelisted.rows.table[i, 2] <-
|
|
sum(is.na(old.copy[, codelistcols[i]]))
|
|
codelisted.rows.table[i, 1] <- codelistcols[i]
|
|
} # Creates the table below
|
|
codelisted.rows.table
|
|
for (i in seq_along(codelistcols)) {
|
|
old.copy[!pull(old.copy[, codelistcols[i]], 1) %in% c(pull(codelist_files[codelistcols[i]][[1]], Description), NA), codelistcols[i]] <-
|
|
NA
|
|
} # Removes the value in case of mismatch
|
|
for (i in seq_along(codelistcols)) {
|
|
old.copy[, codelistcols[i]] <-
|
|
pull(codelist_files[codelistcols[i]][[1]], 2)[match(pull(old.copy, codelistcols[i]),
|
|
pull(codelist_files[codelistcols[i]][[1]], Description))]
|
|
} # Matches each column with the corresponding code list and returns the value
|
|
dtype <-
|
|
saptemplate[saptemplate$`Sheet Name` == "Contact", ]$`Data Type` # List of data types. Non Exhaustive ATM
|
|
for (i in 1:ncol(old.copy)) {
|
|
if (dtype[i] == "String") {old.copy[, i] <- as.character(pull(old.copy, i))}
|
|
if (dtype[i] == "Boolean") {old.copy[, i] <- as.logical(pull(old.copy, i))}
|
|
if (dtype[i] == "DateTime") {old.copy[, i] <- lubridate::ymd_hms(pull(old.copy, i))}
|
|
if (dtype[i] == "Time") { old.copy[, i] <- lubridate::hms(pull(old.copy, i))} # This list will increase and also change based on input date and time formats
|
|
}
|
|
max.length <-
|
|
saptemplate[saptemplate$`Sheet Name` == "Contact", ]$`Max Length` # List of max lengths mentioned
|
|
|
|
colclasses <- lapply(old.copy, class) # getting column classes
|
|
|
|
for (i in 1:ncol(old.copy)) {
|
|
if (colclasses[[i]] == "character") {
|
|
old.copy[, i] <-
|
|
ifelse(nchar(pull(old.copy, i)) > max.length[i],
|
|
substring(pull(old.copy, i), 1, max.length[i]),
|
|
pull(old.copy, i))
|
|
} # If string length is more than mentioned, trim it to the mentioned
|
|
|
|
}
|
|
write.csv(old.copy, "Contact.csv", row.names = FALSE)
|
|
|