DataTransformationCRH/Contacts.Rmd

---
title: "Contacts"
author: "Scary Scarecrow"
date: "12/27/2021"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
strt <- Sys.time()
library(readxl)
library(dplyr)
library(lubridate)
library(DT)
library(tidyr)

#' Build an empty data frame for every sheet named in the SAP template.
#'
#' Reads two objects from the calling session, which must exist before the
#' function is invoked: `saptemplate` (needs the columns `Sheet Name` and
#' `Header`) and `snames` (the sheet names to build frames for).
#'
#' The previous version looped over the undefined name `sheet.na` and used
#' `assign()` inside the function, which only created variables in the
#' function's own frame, so the result was thrown away on return.
#'
#' @return Invisibly, a list named after `snames`; each element is a zero-row
#'   data frame whose columns are the unique `Header` values of that sheet.
mutlstxlrdr <- function() {
  empty_sheets <- vector("list", length(snames))
  names(empty_sheets) <- snames
  for (i in seq_along(snames)) {
    colnames <-
      unique(saptemplate[saptemplate$`Sheet Name` == snames[i], ]$Header)
    # `text = ""` states the "no input, just column names" intent explicitly.
    empty_sheets[[i]] <- read.table(text = "", col.names = colnames)
  }
  invisible(empty_sheets)
}

# Start each run from a clean slate: delete whatever the previous run left in
# the error, summary and output folders (top level of each folder only).
stale_dirs <- c(
  "./contacts/errors/mandatory/",
  "./contacts/errors/codelist/",
  "./contacts/errors/length/",
  "./contacts/summary/",
  "./contacts/output/"
)
for (stale_dir in stale_dirs) {
  file.remove(list.files(stale_dir, full.names = TRUE))
}
```


## Data transformation workflow

Following is the proposed preliminary workflow for the data transformation project.


>All files of a segment (contacts, accounts, etc.) should be inside the relevant folder. Each folder should contain one sub-folder for all code-list files. All legacy data (one file per country) should be inside the raw-data folder, with each file named after its country. A further file holding the field definitions, including the name of the matching column in the legacy file, should also be there.

>*Make sure that there are no hidden files inside the directory.*

### Employees

```{r}
# Employee lookup: one row per employee with the ID and the full name
# ("First_Name Last_Name").
# An earlier revision also merged a legacy "emp.csv" extract into this table;
# only "./employees/empoct.csv" is read now.
employee_extract <- read.csv("./employees/empoct.csv")
employee_extract <- mutate(
  employee_extract,
  Name = paste(First_Name, Last_Name)
)
employeecodes <- select(employee_extract, Employee_ID, Name)


```


### Code Lists


```{r Create List of Files, echo=TRUE, message=FALSE, warning=FALSE}

# Collect the code-list workbooks.
# `pattern` is a regular expression, not a shell glob, so the extension is
# matched explicitly and anchored at the end of the file name.
# We can avoid creating a separate directory for code list. But organizing may
# be difficult. However, this can be explored further if we want transform all
# the data in one go i.e. not by functions (contacts, accounts etc.).
filenames <-
  list.files("./contacts/CodeList",
             pattern = "\\.xlsx$",
             full.names = TRUE)
# Skip the "~$..." lock files Excel creates next to an open workbook; they
# carry the same extension but are not readable workbooks.
filenames <- filenames[!startsWith(basename(filenames), "~$")]

# File paths
print(filenames)
```


Manually check that the list above includes all the code-list files.
If it is correct, read the files.

```{r codelistreader, echo=TRUE, message=FALSE, warning=FALSE}
# Sheet names of every code-list workbook (one character vector per file).
sheet_names <- lapply(filenames, excel_sheets)

# Flat list of all code-list sheets across all workbooks, each read as text
# and named after its sheet.
codelist_files <- NULL
for (file_idx in seq_along(filenames)) {
  workbook <- filenames[[file_idx]]
  workbook_sheets <- lapply(
    sheet_names[[file_idx]],
    function(sheet) {
      read_excel(path = workbook, sheet = sheet, col_types = "text")
    }
  )
  names(workbook_sheets) <- sheet_names[[file_idx]]
  codelist_files <- c(codelist_files, workbook_sheets)
}

# Names of the sheets imported
names(codelist_files)
# Spot check: show the code list for "Title"
codelist_files$Title
```


### Templates


Let us now extract the data. Below we are reading only one file having all data related to `Contacts` from the legacy system.

```{r readlegacyfilepath, echo=TRUE, message=FALSE, warning=FALSE}

# Collect the legacy workbooks (.xls and .xlsx).
# `pattern` is a regular expression, not a shell glob, so the extension is
# matched explicitly and anchored at the end of the file name.
# The directory string (including its trailing slash) is kept exactly as it
# was: the names given to the imported files are derived from these paths.
oldfilepath <-
  list.files("./contacts/raw-data/",
             pattern = "\\.xlsx?$",
             full.names = TRUE)
# Skip the "~$..." lock files Excel creates next to an open workbook.
oldfilepath <- oldfilepath[!startsWith(basename(oldfilepath), "~$")]
print(oldfilepath)
```

Manually check that the list matches the actual files.

```{r readlegacyfiles, echo=TRUE, message=FALSE, warning=FALSE}

# One data frame per legacy workbook (first sheet only). The owner's full name
# in "Full Name (Owning User)" is replaced by the matching Employee_ID; owners
# without a match in `employeecodes` get the placeholder ID "99999".
old_files <- NULL

for (i in seq_along(oldfilepath)) {
  legacy_sheet <- read_excel(path = oldfilepath[[i]], sheet = 1)
  with_owner_id <- left_join(
    legacy_sheet,
    employeecodes,
    by = c(`Full Name (Owning User)` = "Name")
  )
  with_owner_id <- select(with_owner_id, -`Full Name (Owning User)`)
  with_owner_id <- mutate(
    with_owner_id,
    Employee_ID = ifelse(is.na(Employee_ID), "99999", Employee_ID)
  )
  old_files[[i]] <- rename(
    with_owner_id,
    `Full Name (Owning User)` = Employee_ID
  )
}

# Name each entry after its file by removing the directory prefix.
names(old_files) <- gsub("./contacts/raw-data/", "", oldfilepath)
```


*Some errors were noticed in the legacy files: columns with similar or identical names exist.*


```{r readSAPtemplate, echo=TRUE, message=FALSE, warning=FALSE}
# Field definitions of the SAP template (sheet "Field_Definitions").
template_path <- "./contacts/template.xlsx"
saptemplate <- read_excel(path = template_path, sheet = "Field_Definitions")
# First few rows of the imported data
head(saptemplate)
```


*Please note that the format of the tables (sheets) has been slightly changed. Earlier, the corresponding sheet name was mentioned in a row before the actual table. Now, every row mentions the corresponding sheet name. This was done manually for convenience of data extraction.*


```{r createmptySAPfiles, message=FALSE, warning=FALSE, include=FALSE}
#orilo<-"en_US.UTF-8"
#Sys.setlocale(locale="en_US.UTF-8")
#' Flag which values contain something shaped like an e-mail address.
#'
#' @param x Vector of candidate values; coerced to character before matching.
#' @return Logical vector, one element per element of `x`; `TRUE` where the
#'   value contains a match for the address pattern (case-insensitive).
isValidEmail <- function(x) {
  email_pattern <- "\\<[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,}\\>"
  candidates <- as.character(x)
  grepl(email_pattern, candidates, ignore.case = TRUE)
}
# Sheet names declared in the SAP template. Only the "Contact" sheet is
# transformed by the loop below.
snames <- unique(saptemplate$`Sheet Name`)

# For every legacy workbook: map it onto the Contact template, move rows or
# values that break the mandatory / code-list / length rules into error files,
# and write the cleaned data plus per-country summaries.
for (h in seq_along(old_files)) {
  # Copy original data
  old.copy <- old_files[[h]]
  print(paste0(names(old_files[h]), " imported"))

  # Characters 2-3 of the entry name are used as the country code throughout
  # (names are expected to look like "/CN.xlsx").
  Country <- substr(names(old_files[h]), 2, 3)

  # Row-count summary for this file: rows read vs. rows kept.
  err.summ <- data.frame() #Error Cal

  # Creates data frame for each sheet in snames
  for (i in seq_along(snames)) {
    print(paste0("Processing ..", snames[i]))

    if (snames[i] == "Contact") {
      Name <- snames[i]

      # Select the column names from the field description sheet
      print("Creating template")
      sel.template.desc <-
        saptemplate[saptemplate$`Sheet Name` == snames[i], ]
      print("Creating column names")
      sel.template.desc.colnames <- sel.template.desc$Header

      # Create a list by adding values from corresponding legacy data:
      # the template default if one is given, otherwise the legacy column
      # named in `oldkey`, otherwise NA.
      # NOTE(review): `ifelse()` is called with a scalar test here, so the
      # outcome depends on how it handles the value returned for the legacy
      # column. Left exactly as it was; confirm before refactoring.
      temp <- NULL
      print("adding values to template ")
      for (j in seq_along(sel.template.desc.colnames)) {
        temp[j] <-
          ifelse(
            !is.na(sel.template.desc$default[j]),
            as.character(as.vector(sel.template.desc$default[j])),
            ifelse(
              sel.template.desc$oldkey[j] == "NA" |
                is.na(sel.template.desc$oldkey[j]),
              NA,
              as.vector(old.copy[, sel.template.desc$oldkey[j]])
            )
          )
      }

      # Rename the columns according to field description
      print("renaming template ")
      names(temp) <- sel.template.desc.colnames

      # Create data frame from the list
      df <- as.data.frame(temp)
      print("Converted to data frame")
      df$Title <- ifelse(df$Title == "Mrs.", "Ms.", df$Title)
      # The country always comes from the file name, not from the legacy data.
      df$CountryRegion <- toupper(Country)
      # (A previous revision hard-coded Contact_Owner_ID per country file
      # here; that override is no longer applied.)

      # Error summary file
      Expected <- nrow(df)

      # Select essential rows
      print("Identifying essential rows")
      essential.columns <- sel.template.desc |>
        filter(Mandatory == "Yes") |>
        pull(Header)

      error.mandatory <- NULL
      # One row per error type found in this file/sheet.
      error.df <- data.frame()

      # Operate on essential columns including creation of error file
      for (k in seq_along(essential.columns)) {
        if (essential.columns[k] == "Department") {
          print("Department found")
          # Department is overwritten with "Z" + country code for every row.
          df$Department <- paste0("Z", Country)
        }

        print("Creating and writing data with missing mandatory values")
        manerrdt <- df[is.na(df[, essential.columns[k]]), ]
        # Was `nrow(manerrdt > 0)`: the comparison belongs outside nrow().
        if (nrow(manerrdt) > 0) {
          manerrdt <-
            manerrdt |> mutate(error = paste0(essential.columns[k], " missing"))
        }
        # Keep the rejected rows available in the session as well.
        assign(
          paste0(
            "error_mandatory_",
            Country,
            "_",
            snames[i],
            "_",
            essential.columns[k]
          ),
          manerrdt
        )

        # TO be saved in error files
        if (nrow(manerrdt) > 0) {
          write.csv(
            manerrdt,
            paste0(
              "./contacts/errors/mandatory/",
              Country,
              "_",
              snames[i],
              "_",
              essential.columns[k],
              "_error_mandatory.csv"
            ),
            row.names = FALSE,
            na = "",
            fileEncoding = "UTF-8"
          )
        }

        # Error summary file
        err.type <- paste0("Missing ", essential.columns[k])
        err.count <- nrow(manerrdt)

        print("Removing rows with empty essential columns")
        df <- df[!is.na(df[, essential.columns[k]]), ]
        if (err.count > 0) {
          error.df <-
            rbind(
              error.df,
              data.frame(
                Country = Country,
                Name = Name,
                err.type = err.type,
                err.count = err.count
              )
            ) #Error cal
        }
      }

      print("Identifying columns associated with codelists")
      # List of columns that have a codelist
      codelistcols <- sel.template.desc |>
        filter(!is.na(`CodeList File Path`)) |>
        pull(Header)
      for (k in seq_along(codelistcols)) {
        print(paste0("Identifying errors ", codelistcols[k]))
        # The code list is looked up by column name; NA is always accepted.
        codelist <- codelist_files[[codelistcols[k]]]
        allowed <- c(pull(codelist, Description), NA)
        is.mismatch <- !df[, codelistcols[k]] %in% allowed

        def.rows <- which(is.mismatch)
        def.n <- df[def.rows, 1]
        def.rows.val <- df[is.mismatch, codelistcols[k]]
        def.colname <-
          rep(codelistcols[k], length.out = length(def.rows))
        def <- data.frame(def.rows, def.n, def.rows.val, def.colname)

        # Was `nrow(def > 0)`: the comparison belongs outside nrow().
        if (nrow(def) > 0) {
          # The object name now uses the current file's country code, like
          # the CSV below (it used to be built from the first file's name).
          assign(paste0(
            "error_codematch_",
            Country,
            "_",
            snames[i],
            "_",
            codelistcols[k]
          ),
          def) # TO be saved
          write.csv(
            def,
            paste0(
              "./contacts/errors/codelist/",
              Country,
              "_",
              snames[i],
              "_",
              codelistcols[k],
              "_error_codematch_.csv"
            ),
            row.names = FALSE,
            na = "",
            fileEncoding = "UTF-8"
          )
        }
        err.type <-
          paste0("Codelist Mismatch ", codelistcols[k]) #Error cal
        err.count <- nrow(def) #Error cal
        if (err.count > 0) {
          error.df <-
            rbind(
              error.df,
              data.frame(
                Country = Country,
                Name = Name,
                err.type = err.type,
                err.count = err.count
              )
            ) #Error cal
        }

        print(paste0("Removing errors ", codelistcols[k]))
        # Removes any mismatch
        df[is.mismatch, codelistcols[k]] <- NA

        # Matches each column with the corresponding code list and returns
        # the value from the code list's second column
        df[, codelistcols[k]] <-
          as.character(pull(codelist, 2)[match(pull(df, codelistcols[k]),
                                               pull(codelist, Description))])
      }

      max.length <- as.numeric(sel.template.desc$`Max Length`)
      dtype <- sel.template.desc$`Data Type`
      lenght.issue.df <- NULL

      # Changing the data class
      for (k in seq_len(ncol(df))) {
        if (dtype[k] == "String") {
          df[, k] <- as.character(pull(df, k))
        }
        if (dtype[k] == "Boolean") {
          df[, k] <- as.logical(pull(df, k))
        }
        if (dtype[k] == "DateTime") {
          df[, k] <- lubridate::ymd_hms(pull(df, k))
        }
        if (dtype[k] == "Time") {
          df[, k] <- lubridate::hms(pull(df, k))
        } # This list will increase and also change based on input date and time formats
      }

      print("Rectifying streetname")
      # Street and House Number: split "Street" into a non-digit prefix and
      # the remainder starting at the first digit.
      if (any(colnames(df) == "Street")) {
        print("found steet")

        df$Streetname <- NA
        df$HouseNumber <- NA
        df <- tidyr::extract(df,
                             "Street",
                             c("Streetname", "HouseNumber"),
                             "(\\D+)(\\d.*)")
        df <- df |>
          select(-c("House_Number")) |>
          rename(Street = Streetname, House_Number = HouseNumber) |>
          select(all_of(sel.template.desc.colnames))
      }

      # Rectifying Phone and Mobile numbers: "+" becomes "00".
      # NOTE(review): the original applied the Mobile replacement twice (the
      # second pass could not change anything) and its comment also mentioned
      # Fax, which was never handled. Confirm whether a fax column needs the
      # same treatment.
      if (any(colnames(df) == "Phone")) {
        print("Found Phone")
        df$Phone <- gsub("[+]", "00", df$Phone)
      }

      if (any(colnames(df) == "Mobile")) {
        print("Found Mobile")
        df$Mobile <- gsub("[+]", "00", df$Mobile)
      }

      # Length Rectification
      print("Rectifying Length")
      cntr <- Country
      for (k in seq_len(ncol(df))) {
        # Only character columns have a length to check. `is.character()`
        # replaces `class(x) == "character"`, which yields a length-2
        # condition for multi-class columns such as POSIXct.
        if (!is.character(df[[k]])) {
          next
        }
        print("found character column ")
        rowval <- pull(df, 1)
        colval <- pull(df, k)
        col.nchar <- nchar(colval)
        # Empty and missing values count as length 1.
        ival <- ifelse(col.nchar == 0 | is.na(col.nchar), 1, col.nchar)
        rval <- max.length[k]
        colnm <- colnames(df)[k]
        # NOTE(review): columns 8, 9 and 47 are addressed by position and
        # presumably hold first name, last name and owner; verify against
        # the template.
        fname <- pull(df, 8)
        lname <- pull(df, 9)
        owner <- pull(df, 47)

        # rectifying data length; values are only cut when a limit is known,
        # so a missing "Max Length" no longer blanks the whole column.
        too.long <- !is.na(col.nchar) & !is.na(rval) & col.nchar > rval
        df[too.long, k] <- substring(colval[too.long], 1, rval)

        # Recorded inside the character branch so that a non-character column
        # cannot re-append the previous column's values.
        lenght.issue.df <-
          rbind(
            lenght.issue.df,
            data.frame(
              rowval,
              ival,
              rval,
              colnm,
              colval,
              cntr,
              fname,
              lname,
              owner
            )
          )

        err.type <-
          paste0("Length error ", colnames(df)[k]) # Error cal
        err.count <- sum(ival > rval, na.rm = TRUE) # Error cal
        if (err.count > 0) {
          error.df <-
            rbind(
              error.df,
              data.frame(
                Country = Country,
                Name = Name,
                err.type = err.type,
                err.count = err.count
              )
            ) #Error cal
        }
      }

      # Keep only the values that actually exceeded their limit.
      if (!is.null(lenght.issue.df)) {
        lenght.issue.df <- dplyr::filter(lenght.issue.df, ival > rval)

        if (nrow(lenght.issue.df) > 0) {
          write.csv(
            lenght.issue.df,
            paste0(
              "./contacts/errors/length/",
              Country,
              "_",
              snames[i],
              "_length_error.csv"
            ),
            row.names = FALSE,
            na = "",
            fileEncoding = "UTF-8"
          )
        }
      }

      # The in-session copy is stored before the e-mail placeholder is
      # applied; the CSV below contains the placeholder.
      assign(snames[i], df)
      df <- df |>
        mutate(EMail = ifelse(isValidEmail(EMail), EMail, "missing@leviat.com"))
      # `sep` was dropped: write.csv() ignores it and only emits a warning.
      write.csv(
        df,
        paste0(
          "./contacts/output/",
          Country,
          "_",
          snames[i],
          ".csv"
        ),
        row.names = FALSE,
        na = "",
        fileEncoding = "UTF-8"
      )
      if (nrow(error.df) > 0) {
        write.csv(
          error.df,
          paste0(
            "./contacts/summary/",
            Country,
            "_",
            snames[i],
            "_error",
            ".csv"
          ),
          row.names = FALSE,
          na = "",
          fileEncoding = "UTF-8"
        ) # Error write
      }

      # Row-count summary; appended only for the sheet that was processed so
      # other sheets cannot add stale copies of these values.
      err.summ <-
        rbind(
          err.summ,
          data.frame(
            Country = Country,
            Name = Name,
            Expected = Expected,
            Actual = nrow(df)
          )
        ) #Error Cal
    }
  }

  # NOTE(review): `i` still holds the last sheet index here, so the file is
  # named after the last entry of `snames`, not necessarily "Contact". The
  # name is kept unchanged for downstream consumers; confirm it is intended.
  write.csv(
    err.summ,
    paste0(
      "./contacts/summary/",
      Country,
      "_",
      snames[i],
      "_sumerror",
      ".csv"
    ),
    row.names = FALSE,
    na = "",
    fileEncoding = "UTF-8"
  ) # Error Write
}

end <- Sys.time()

end - strt


```
*The code failed because the Department column appears several times in the data, and while importing, R renamed the duplicates to `Department..xx`.*
*Manually verify if these are the required templates*

```{r}
# Combine the per-country output files into one CSV and one Excel workbook.

# Make sure the target folder exists so the first run does not fail on write.
combined_dir <- "./contacts/output/combined"
dir.create(combined_dir, recursive = TRUE, showWarnings = FALSE)

# `pattern` is a regular expression, not a shell glob: match the extension
# explicitly and anchor it at the end of the file name.
opfilepath <-
  list.files("./contacts/output",
             pattern = "\\.csv$",
             full.names = TRUE)
# Everything is read as text so that codes and numbers keep their exact
# written form (e.g. leading zeros).
opfiles <- lapply(
  opfilepath,
  read.csv,
  colClasses = "character",
  header = TRUE,
  row.names = NULL
)
opdf <- do.call(rbind.data.frame, opfiles)
write.csv(
  opdf,
  file.path(combined_dir, "combined.csv"),
  row.names = FALSE,
  na = "",
  fileEncoding = "UTF-8"
)

openxlsx::write.xlsx(opdf, file.path(combined_dir, "combined.xlsx"))


```


# Duplicate check

```{r}
# Duplicate check on the contact's full name ("First_Name Last_Name"), within
# the transformed legacy data and between it and the SAP extract.

# Columns compared between the two sources.
compare_cols <- c(
  "External_Key", "Account_External_Key", "FullName", "CountryRegion",
  "Function", "House_Number", "Street", "City", "Postal_Code", "EMail",
  "Contact_Owner_ID"
)

contwav2 <- read.csv("./contacts/output/combined/combined.csv")
contwav2 <- mutate(contwav2, FullName = paste(First_Name, Last_Name))
sapcont <- read.csv("contoct.csv")
sapcont <- mutate(sapcont, FullName = paste(First_Name, Last_Name))

# Show legacy rows whose full name occurs more than once in the legacy data.
legacy_dup <- duplicated(contwav2$FullName) |
  duplicated(contwav2$FullName, fromLast = TRUE)
contwav2[legacy_dup, ]

contwav2 <- contwav2 |>
  select(all_of(compare_cols)) |>
  mutate(source = "legacy CRM")

# SAP side: AT and CH are left out, and empty keys are made visible.
sapcont <- sapcont |>
  select(all_of(compare_cols)) |>
  mutate(source = "S4-CAA200") |>
  filter(!CountryRegion %in% c("AT", "CH")) |>
  mutate(
    External_Key = ifelse(External_Key == "", "EMPTY IN SAP", External_Key),
    Account_External_Key = ifelse(
      Account_External_Key == "",
      "EMPTY IN SAP",
      Account_External_Key
    )
  )

fullcont <- rbind(contwav2, sapcont)

# Rows whose full name occurs more than once across both sources.
shared_name <- duplicated(fullcont$FullName) |
  duplicated(fullcont$FullName, fromLast = TRUE)

# `errorsource` is the single source a name is duplicated in, or "Both" when
# the same name appears in the legacy data and in SAP. The owner ID is
# replaced by the employee's name where the ID is known.
duplicate_report <- fullcont[shared_name, ] |>
  select(External_Key, FullName, source, matches(".")) |>
  rename(Source = source) |>
  arrange(FullName) |>
  group_by(FullName) |>
  mutate(single_source = n_distinct(Source) == 1) |>
  ungroup() |>
  mutate(errorsource = ifelse(single_source, Source, "Both")) |>
  select(-single_source) |>
  # check if we need to send all, because several are same names in legacy
  select(External_Key, FullName, Source, errorsource, matches(".")) |>
  left_join(employeecodes, by = c("Contact_Owner_ID" = "Employee_ID")) |>
  mutate(Contact_Owner_ID = ifelse(is.na(Name), Contact_Owner_ID, Name)) |>
  select(-Name)

write.csv(
  duplicate_report,
  "./contacts/errors/duplicatecontacts.csv",
  row.names = FALSE
)
```
Ver 0.01 4 years ago			`---`
			`title: "Contacts"`
			`author: "Scary Scarecrow"`
			`date: "12/27/2021"`
			`output: html_document`
			`---`

			```{r setup, include=FALSE}
			`knitr::opts_chunk$set(echo = TRUE)`
wave 2 app 3 years ago			`strt <- Sys.time()`
Ver 0.01 4 years ago			`library(readxl)`
			`library(dplyr)`
			`library(lubridate)`
			`library(DT)`
			`library(tidyr)`

wave 2 app 3 years ago			`mutlstxlrdr <- function() {`
			`for (i in seq_along(sheet.na)) {`
			`colnames <-`
			unique(saptemplate[saptemplate$`Sheet Name` == snames[i], ]$Header)
			`df <- read.table("", col.names = colnames)`
			`assign(snames[i], df)`

			`}`
Ver 0.01 4 years ago			`}`
applied new codelist. Changed the report. 4 years ago
wave 2 app 3 years ago			`do.call(file.remove, list(list.files(`
			`"./contacts/errors/mandatory/", full.names = TRUE`
			`)))`
			`do.call(file.remove, list(list.files(`
			`"./contacts/errors/codelist/", full.names = TRUE`
			`)))`
			`do.call(file.remove, list(list.files(`
			`"./contacts/errors/length/", full.names = TRUE`
			`)))`
applied new codelist. Changed the report. 4 years ago			`do.call(file.remove, list(list.files("./contacts/summary/", full.names = TRUE)))`
			`do.call(file.remove, list(list.files("./contacts/output/", full.names = TRUE)))`
Ver 0.01 4 years ago			```


			`## Data transformation workflow`

			`Following is the proposed preliminary workflow for the data transformation project.`


			`>All file of a segment (contacts/accounts etc..) should be inside the relevant folder. Each folder should have one folder for all codelist files. All legacy data (one file for each country) should be inside the raw-data folder, named after each country. Another file having field definitions including name of the matching column from the legacy file should also be there.`

			`>Make sure that there are no hidden files inside the directory.`

wave 2 app 3 years ago			`### Employees`

			```{r}
			`# employeecodes<-read.csv("emp.csv")`
			`# employeecodes<-employeecodes \|> select(c(1,2))`
			`# employeecodesnew<-read.csv("./employees/empoct.csv") \|>`
			`# select(c(Employee_ID,First_Name,Last_Name)) \|>`
			`# mutate(Name=paste(First_Name, Last_Name)) \|>`
			`# select(Employee_ID,Name) \|>`
			`# rename(Employee.ID=Employee_ID)`
			`# employeecodes<-rbind(employeecodes,employeecodesnew) \|>`
			`# unique()`
			`employeecodes<-read.csv("./employees/empoct.csv") \|>`
			`mutate(Name=paste(First_Name, Last_Name)) \|>`
			`select(Employee_ID,Name)`

			```

Ver 0.01 4 years ago
			`### Code Lists`


			```{r Create List of Files, echo=TRUE, message=FALSE, warning=FALSE}

wave 2 app 3 years ago			`filenames <-`
			`list.files("./contacts/CodeList",`
			`pattern = "*.xlsx",`
			`full.names = T) # We can avoid creating a separate directory for code list. But organizing may be difficult. However, this can be explored further if we want transform all the data in one go i.e. not by functions (contacts, accounts etc.).`
Ver 0.01 4 years ago
			`# File paths`
			`print(filenames)`
			```


			`Check manually if the above list includes all the codelist files`
			`If correct, then read the files.`

			```{r codelistreader, echo=TRUE, message=FALSE, warning=FALSE}
wave 2 app 3 years ago			`sheet_names <-`
			`lapply(filenames, excel_sheets) # Creates a list of the sheet names`
			`codelist_files <- NULL`
			`for (i in seq_along(filenames)) {`
			`a <-`
			`lapply(excel_sheets(filenames[[i]]),`
			`read_excel,`
			`path = filenames[[i]],`
			`col_types = "text") # Reads the sheets of the excel files`
			`names(a) <-`
			`c(sheet_names[[i]]) # Renames them according to the sheet names extracted above`
			`codelist_files <- c(codelist_files, a)`
			`}`
Ver 0.01 4 years ago			`# Names of the files imported`
			`names(codelist_files)`
			`#codelist_files<-unique(codelist_files)`
ver 0.2 4 years ago			`codelist_files$Title`
Ver 0.01 4 years ago			```



			`### Templates`


			Let us now extract the data. Below we are reading only one file having all data related to `Contacts` from the legacy system.

			```{r readlegacyfilepath, echo=TRUE, message=FALSE, warning=FALSE}

wave 2 app 3 years ago			`oldfilepath <-`
			`list.files("./contacts/raw-data/",`
			`pattern = "*.xls",`
			`full.names = T)`
Ver 0.01 4 years ago			`print(oldfilepath)`
			```

			`Check it the list matches the actual files, manually.`

applied new codelist. Changed the report. 4 years ago			```{r readlegacyfiles, echo=TRUE, message=FALSE, warning=FALSE}
Ver 0.01 4 years ago
wave 2 app 3 years ago			`old_files <- NULL`
Ver 0.01 4 years ago
			`#read_excel(path = oldfilepath[[i]], sheet = 1)`
wave 2 app 3 years ago			`for (i in seq_along(oldfilepath)) {`
			`a<- read_excel(path = oldfilepath[[i]], sheet = 1)`
			`a<-a \|>`
			left_join(employeecodes, by=c(`Full Name (Owning User)` = "Name")) \|>
			select(-`Full Name (Owning User)`) \|>
			`mutate(Employee_ID=ifelse(is.na(Employee_ID),"99999",Employee_ID)) \|>`
			rename(`Full Name (Owning User)`=Employee_ID)
			`old_files[[i]]<-a`
			`}`
Ver 0.01 4 years ago
wave 2 app 3 years ago			`names(old_files) <- gsub("./contacts/raw-data/", "", oldfilepath)`
Ver 0.01 4 years ago			```


			`Some errors in the legacy file noticed. Columns with similar or same name exists.`



			```{r readSAPtemplate, echo=TRUE, message=FALSE, warning=FALSE}
wave 2 app 3 years ago			`saptemplate <-`
			`read_excel("./contacts/template.xlsx", sheet = "Field_Definitions")`
Ver 0.01 4 years ago			`# First few rows of the imported data`
			`head(saptemplate)`
			```


			`Please note that the format of the tables (sheet) has been slightly changed. Earlier the corresponding sheet name was mentioned in a row before the actual table. Now, all the rows mention the corresponding sheet name. This was done manually for convenience of data extraction`



wave 2 app 3 years ago

			```{r createmptySAPfiles, message=FALSE, warning=FALSE, include=FALSE}
Ver 0.01 4 years ago			`#orilo<-"en_US.UTF-8"`
			`#Sys.setlocale(locale="en_US.UTF-8")`
wave 2 app 3 years ago			`isValidEmail <- function(x) {`
			`grepl("\\<[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,}\\>", as.character(x), ignore.case=TRUE)`
			`}`
Ver 0.01 4 years ago			snames <- unique(saptemplate$`Sheet Name`)

			`for (h in seq_along(old_files)) {`
			`# Copy original data`
			`old.copy <- old_files[[h]]`
wave 2 app 3 years ago			`print(paste0(names(old_files[h]), " imported"))`
			`err.summ <-`
			`data.frame(`
			`Country = NULL,`
			`Name = NULL,`
			`Expected = NULL,`
			`Actual = NULL`
			`) #Error Cal`
Ver 0.01 4 years ago			`# Creates data frame for each sheet in snames`
			`for (i in seq_along(snames)) {`
wave 2 app 3 years ago			`print(paste0("Processing ..", snames[i]))`
Ver 0.01 4 years ago
wave 2 app 3 years ago			`if (snames[i] == "Contact") {`
			`# Select the column names from the field description sheet`
			`print("Creating template")`
			`sel.template.desc <-`
			saptemplate[saptemplate$`Sheet Name` == snames[i],]
			`print("Creating column names")`
			`sel.template.desc.colnames <- sel.template.desc$Header`
Ver 0.01 4 years ago
wave 2 app 3 years ago			`# Create a list by adding values from corresponding legacy data`
			`temp <- NULL`
			`print("adding values to template ")`
			`for (j in seq_along(sel.template.desc.colnames)) {`
			`temp[j] <-`
			`ifelse(`
			`!is.na(sel.template.desc$default[j]),`
			`as.character(as.vector(sel.template.desc$default[j])),`
			`ifelse(`
			`sel.template.desc$oldkey[j] == "NA" \|`
			`is.na(sel.template.desc$oldkey[j]),`
			`NA,`
			`as.vector(old.copy[, sel.template.desc$oldkey[j]])`
			`)`
			`)`


Ver 0.01 4 years ago			`}`

wave 2 app 3 years ago			`# Rename the columns according to field description`
			`print("renaming template ")`
			`names(temp) <- sel.template.desc.colnames`
Ver 0.01 4 years ago
wave 2 app 3 years ago			`# Create data frame from the list`
			`df <- as.data.frame(temp)`
			`print("Converted to data frame")`
			`df$Title <- ifelse(df$Title == "Mrs.", "Ms.", df$Title)`
			`# df$CountryRegion <- ifelse(!is.na(df$CountryRegion),`
			`# toupper(substr(names(old_files[h]), 2, 3)),`
			`# df$CountryRegion)`
			`df$CountryRegion <- toupper(substr(names(old_files[h]), 2, 3))`
Ver 0.01 4 years ago
wave 2 app 3 years ago			`# if(names(old_files)=="/CN.xlsx"){`
			`# df$Contact_Owner_ID<-"226"`
			`# }`
			`# if(names(old_files)=="/CZ.xlsx"){`
			`# df$Contact_Owner_ID<-"390"`
			`# }`
			`# if(names(old_files)=="/FI.xlsx"){`
			`# df$Contact_Owner_ID<-"325"`
			`# }`
			`# if(names(old_files)=="/DE.xlsx"){`
			`# df$Contact_Owner_ID<-"289"`
			`# }`
			`# if(names(old_files)=="/IT.xlsx"){`
			`# df$Contact_Owner_ID<-"182"`
			`# }`
			`# if(names(old_files)=="/PL.xlsx"){`
			`# df$Contact_Owner_ID<-"368"`
			`# }`
			`# if(names(old_files)=="/ES.xlsx"){`
			`# df$Contact_Owner_ID<-"447"`
			`# }`
			`# if(names(old_files)=="/SE.xlsx"){`
			`# df$Contact_Owner_ID<-"351"`
			`# }`
			`# if(names(old_files)=="/NL.xlsx"){`
			`# df$Contact_Owner_ID<-"90052"`
			`# }`
			`# if(names(old_files)=="/NO.xlsx"){`
			`# df$Contact_Owner_ID<-"000"`
			`# }`
Ver 0.01 4 years ago
wave 2 app 3 years ago			`# Error summary file`
			`Expected <- nrow(df)`
Ver 0.01 4 years ago
wave 2 app 3 years ago			`#Select essential rows`
			`print("Identifying essential rows")`
			`sel.template.desc \|>`
			`filter(Mandatory == "Yes") \|>`
			`pull(Header) -> essential.columns`
Ver 0.01 4 years ago

wave 2 app 3 years ago			`error.mandatory <- NULL`
			`error.df <-`
			`data.frame(`
			`Country = NULL,`
			`Name = NULL,`
			`Rows = NULL,`
			`Expected = NULL`
			`)`
			`# Operate on essential columns including creation of error file`
			# Validate each mandatory column in turn: rows with a missing value are
			# saved to a per-column error file, counted in error.df and dropped from df.
			for (k in seq_along(essential.columns)) {
			  mandatory.col <- essential.columns[k]
			  # Country and Name label every error record written for this sheet.
			  Country <- substr(names(old_files[h]), 2, 3)
			  Name <- snames[i]

			  if (mandatory.col == "Department") {
			    print("Department found")
			    #stop()
			    # Department is overwritten with "Z" plus the code taken from the file name.
			    df$Department <- paste0("Z", Country)
			  }

			  print("Creating and writing data with missing mandatory values")
			  # Compute the NA mask once so the extract, the count and the removal agree.
			  is.missing <- is.na(df[, mandatory.col])
			  manerrdt <- df[is.missing, ]
			  err.count <- nrow(manerrdt)
			  # The row count itself must be compared with zero; nrow(manerrdt > 0)
			  # compared every cell with zero first and only worked by accident.
			  if (err.count > 0) {
			    manerrdt <- mutate(manerrdt, error = paste0(mandatory.col, " missing"))
			  }
			  assign(
			    paste0("error_mandatory_", Country, "_", Name, "_", mandatory.col),
			    manerrdt
			  )

			  # To be saved in error files
			  if (err.count > 0) {
			    write.csv(
			      manerrdt,
			      paste0(
			        "./contacts/errors/mandatory/",
			        Country,
			        "_",
			        Name,
			        "_",
			        mandatory.col,
			        "_error_mandatory.csv"
			      ),
			      row.names = FALSE,
			      na = "",
			      fileEncoding = "UTF-8"
			    )
			  }

			  print("Removing rows with empty essential columns")
			  df <- df[!is.missing, ]

			  # Error summary file
			  err.type <- paste0("Missing ", mandatory.col)
			  if (err.count > 0) {
			    error.df <- rbind(
			      error.df,
			      data.frame(
			        Country = Country,
			        Name = Name,
			        err.type = err.type,
			        err.count = err.count
			      )
			    )
			  }
			}
wave 2 app 3 years ago

			print("Identifying columns associated with codelists")
			# List of columns that have a codelist
			codelistcols <- sel.template.desc |>
			  filter(!is.na(`CodeList File Path`)) |>
			  pull(Header)

			# Set the labels here as well, so the summary rows below do not depend on
			# the mandatory-column loop having run at least once.
			Country <- substr(names(old_files[h]), 2, 3)
			Name <- snames[i]

			# For every codelist column: report values that are not in the codelist,
			# blank them, then replace each description by the codelist's second column.
			for (k in seq_along(codelistcols)) {
			  code.col <- codelistcols[k]
			  # Look the codelist up once per column instead of once per expression.
			  codelist <- codelist_files[code.col][[1]]
			  descriptions <- pull(codelist, Description)

			  print(paste0("Identifying errors ", code.col))
			  # TRUE for values that are neither NA nor a known description.
			  is.mismatch <- !df[, code.col] %in% c(descriptions, NA)
			  def.rows <- which(is.mismatch)
			  def.n <- df[def.rows, 1]
			  def.rows.val <- df[is.mismatch, code.col]
			  def.colname <- rep(code.col, length.out = length(def.rows))
			  def <- data.frame(def.rows, def.n, def.rows.val, def.colname)

			  # The row count itself must be compared with zero (was nrow(def > 0)).
			  if (nrow(def) > 0) {
			    # Use the current file (h) and the same two characters as every other
			    # error name; the original read old_files[1] with a different offset.
			    assign(
			      paste0("error_codematch_", Country, "_", Name, "_", code.col),
			      def
			    ) # TO be saved
			    write.csv(
			      def,
			      paste0(
			        "./contacts/errors/codelist/",
			        Country,
			        "_",
			        Name,
			        "_",
			        code.col,
			        "_error_codematch_.csv"
			      ),
			      row.names = FALSE,
			      na = "",
			      fileEncoding = "UTF-8"
			    )
			  }

			  err.type <- paste0("Codelist Mismatch ", code.col) # Error cal
			  err.count <- nrow(def) # Error cal
			  if (err.count > 0) {
			    error.df <- rbind(
			      error.df,
			      data.frame(
			        Country = Country,
			        Name = Name,
			        err.type = err.type,
			        err.count = err.count
			      )
			    ) # Error cal
			  }

			  print(paste0("Removing errors ", code.col))
			  # Removes any mismatch
			  df[is.mismatch, code.col] <- NA

			  # Matches each column with the corresponding code list and returns the value
			  df[, code.col] <-
			    as.character(pull(codelist, 2)[match(pull(df, code.col), descriptions)])
			}
# Per-column template metadata, indexed by column position below.
# NOTE(review): assumes the template rows are in the same order as the
# columns of df - confirm.
max.length <- as.numeric(sel.template.desc$`Max Length`)
dtype <- sel.template.desc$`Data Type`

# Reset the working variables of the length check further down.
rowval <- NULL
ival <- NULL
rval <- NULL
lenght.issue.df <- NULL
fname <- NULL
lname <- NULL
owner <- NULL

# Changing the data class
# seq_len() keeps the loop from running on a data frame without columns.
for (k in seq_len(ncol(df))) {
  col.type <- dtype[k]
  # A column without a declared type is left untouched; comparing NA inside
  # `if` would stop the run with "missing value where TRUE/FALSE needed".
  if (is.na(col.type)) {
    next
  }
  if (col.type == "String") {
    df[, k] <- as.character(pull(df, k))
  } else if (col.type == "Boolean") {
    df[, k] <- as.logical(pull(df, k))
  } else if (col.type == "DateTime") {
    df[, k] <- lubridate::ymd_hms(pull(df, k))
  } else if (col.type == "Time") {
    df[, k] <- lubridate::hms(pull(df, k))
  } # This list will increase and also change based on input date and time formats
}
wave 2 app 3 years ago
			print("Rectifying streetname")
			# Street and House Number: split Street into the text before the first
			# digit (street name) and everything from that digit on (house number).
			if (any(colnames(df) == "Street")) {
			  print("found steet")
			  # stop()

			  # Keep the untouched value so it can be restored when the split fails.
			  street.raw <- df$Street

			  df$Streetname <- NA
			  df$HouseNumber <- NA
			  #df |> extract("Street", "(\\D+)(\\d.*)")
			  df <- tidyr::extract(
			    df,
			    "Street",
			    c("Streetname", "HouseNumber"),
			    "(\\D+)(\\d.*)"
			  )

			  # A street without any digit does not match the pattern, which leaves
			  # both parts NA. Keep the original text as street name in that case
			  # instead of silently losing the address.
			  no.match <- is.na(df$Streetname) & !is.na(street.raw)
			  df$Streetname[no.match] <- as.character(street.raw[no.match])

			  df <- df |>
			    # any_of() tolerates files that have no legacy House_Number column.
			    select(-any_of("House_Number")) |>
			    rename(Street = Streetname, House_Number = HouseNumber) |>
			    select(all_of(sel.template.desc.colnames))
			}
Ver 0.01 4 years ago
wave 2 app 3 years ago			`# Rectifying Phone, Mobile and Fax numbers`
			# Replace every "+" by "00" in the Phone, Mobile and Fax columns.
			# The third copy of this step used to test Mobile a second time, so Fax
			# was never processed; one loop over the three names avoids that.
			for (number.col in c("Phone", "Mobile", "Fax")) {
			  if (any(colnames(df) == number.col)) {
			    print(paste0("Found ", number.col))
			    df[[number.col]] <- gsub("[+]", "00", df[[number.col]])
			  }
			}
applied new codelist. Changed the report. 4 years ago

wave 2 app 3 years ago			`# Length Rectification`
			# Length Rectification
			# For every character column: record each value with its length and the
			# allowed maximum, truncate over-long values, and add a summary row to
			# error.df. Over-long values are written to a length error file afterwards.
			print("Rectifying Length")
			length.pieces <- list()
			for (k in seq_len(ncol(df))) {
			  # is.character() also copes with columns whose class() has several
			  # entries (e.g. date-times), where `class(x) == "character"` yields a
			  # condition of length > 1.
			  if (!is.character(df[[k]])) {
			    # Non-character columns are skipped entirely. The original fell through
			    # and re-recorded the previous column's values for them.
			    next
			  }
			  print("found character column ")

			  rowval <- pull(df, 1)
			  colval <- pull(df, k)
			  char.count <- nchar(colval)
			  # Empty and missing values are counted as length 1.
			  ival <- ifelse(char.count == 0 | is.na(char.count), 1, char.count)
			  rval <- max.length[k]
			  colnm <- colnames(df)[k]
			  cntr <- substr(names(old_files[h]), 2, 3)
			  # NOTE(review): positions 8, 9 and 47 are hard-coded; presumably first
			  # name, last name and contact owner - confirm against the template.
			  fname <- pull(df, 8)
			  lname <- pull(df, 9)
			  owner <- pull(df, 47)

			  # rectifying data length
			  # Without a usable maximum the column is left as it is; comparing with
			  # NA would turn every value in the column into NA.
			  if (!is.na(rval)) {
			    df[, k] <- ifelse(char.count > rval, substring(colval, 1, rval), colval)
			  }

			  # Add name and email
			  # data.frame() cannot combine zero-row columns with the length-one
			  # labels, so an empty df contributes nothing.
			  if (nrow(df) > 0) {
			    length.pieces[[length(length.pieces) + 1]] <- data.frame(
			      rowval,
			      ival,
			      rval,
			      colnm,
			      colval,
			      cntr,
			      fname,
			      lname,
			      owner
			    )
			  }

			  err.type <- paste0("Length error ", colnm) # Error cal
			  err.count <- sum(ival > rval, na.rm = TRUE) # Error cal
			  if (err.count > 0) {
			    error.df <- rbind(
			      error.df,
			      data.frame(
			        Country = Country,
			        Name = Name,
			        err.type = err.type,
			        err.count = err.count
			      )
			    ) # Error cal
			  }
			}

			# Bind all per-column records in one go instead of growing the data frame
			# inside the loop. The result stays NULL when nothing was recorded.
			lenght.issue.df <- do.call(rbind, c(list(lenght.issue.df), length.pieces))

			if (!is.null(lenght.issue.df)) {
			  # Keep only the values that exceeded their maximum length.
			  lenght.issue.df <- dplyr::filter(lenght.issue.df, ival > rval)

			  if (nrow(lenght.issue.df) > 0) {
			    write.csv(
			      lenght.issue.df,
			      paste0(
			        "./contacts/errors/length/",
			        substr(names(old_files[h]), 2, 3),
			        "_",
			        snames[i],
			        "_length_error.csv"
			      ),
			      row.names = FALSE,
			      na = "",
			      fileEncoding = "UTF-8"
			    )
			  }
			}

			# Keep a copy of the cleaned sheet under its sheet name (taken before the
			# e-mail placeholder below is applied).
			assign(snames[i], df)

			# Addresses rejected by isValidEmail() are replaced by a placeholder.
			df <- df |>
			  mutate(EMail = ifelse(isValidEmail(EMail), EMail, "missing@leviat.com"))

			file.prefix <- paste0(substr(names(old_files[h]), 2, 3), "_", snames[i])

			# Cleaned data for this file and sheet.
			# write.csv() always writes comma-separated output; passing `sep` is
			# ignored with a warning, so the argument is dropped.
			write.csv(
			  df,
			  paste0("./contacts/output/", file.prefix, ".csv"),
			  row.names = FALSE,
			  na = "",
			  fileEncoding = "UTF-8"
			)

			# Error summary, written only when at least one error was recorded.
			if (nrow(error.df) > 0) {
			  write.csv(
			    error.df,
			    paste0("./contacts/summary/", file.prefix, "_error", ".csv"),
			    row.names = FALSE,
			    na = "",
			    fileEncoding = "UTF-8"
			  ) # Error write
			}
Ver 0.01 4 years ago			`}`

# Append one row comparing the row count before validation (Expected) with
# the row count of df as it stands now (Actual).
summary.row <- data.frame(
  Country = Country,
  Name = Name,
  Expected = Expected,
  Actual = nrow(df)
)
err.summ <- rbind(err.summ, summary.row) # Error Cal
Ver 0.01 4 years ago
wave 2 app 3 years ago			`}`
			# Write the expected-versus-actual row counts collected in err.summ.
			sumerror.path <- paste0(
			  "./contacts/summary/",
			  substr(names(old_files[h]), 2, 3),
			  "_",
			  snames[i],
			  "_sumerror",
			  ".csv"
			)
			write.csv(
			  err.summ,
			  sumerror.path,
			  row.names = FALSE,
			  na = "",
			  fileEncoding = "UTF-8"
			) # Error Write
Ver 0.01 4 years ago			`}`

# Show how long the run took, measured from strt (set in the setup chunk).
end <- Sys.time()
difftime(end, strt)
Report Updated, Error Updated, Output Updated 4 years ago
Ver 0.01 4 years ago
			```
The code failed because the Department column appears several times in the data, and R renamed the duplicates to Department..xx while importing.
Manually verify whether these are the required templates.

Report Updated, Error Updated, Output Updated 4 years ago			```{r}
# Combine every per-file output CSV into one CSV and one Excel workbook.

# The combined files go into a sub-folder; create it when it is missing so
# the writes below cannot fail on a fresh checkout.
dir.create("./contacts/output/combined", showWarnings = FALSE, recursive = TRUE)

# `pattern` is a regular expression, not a glob: match names ending in ".csv".
opfilepath <- list.files(
  "./contacts/output",
  pattern = "\\.csv$",
  full.names = TRUE
)

# Read everything as text so keys and postal codes keep their leading zeros.
# The files were written as UTF-8, so they are read back as UTF-8.
opfiles <- lapply(
  opfilepath,
  read.csv,
  colClasses = "character",
  header = TRUE,
  row.names = NULL,
  fileEncoding = "UTF-8"
)
opdf <- do.call(rbind.data.frame, opfiles)

write.csv(
  opdf,
  "./contacts/output/combined/combined.csv",
  row.names = FALSE,
  na = "",
  fileEncoding = "UTF-8"
)

openxlsx::write.xlsx(opdf, "./contacts/output/combined/combined.xlsx")
Report Updated, Error Updated, Output Updated 4 years ago
			```
Ver 0.01 4 years ago



wave 2 app 3 years ago			`# Duplicate check`
Ver 0.01 4 years ago
wave 2 app 3 years ago			```{r}
			# Load the combined legacy contacts and the SAP extract; both get a
			# FullName column built from first and last name.
			contwav2 <- read.csv("./contacts/output/combined/combined.csv")
			contwav2 <- mutate(contwav2, FullName = paste(First_Name, Last_Name))
			sapcont <- read.csv("contoct.csv")
			sapcont <- mutate(sapcont, FullName = paste(First_Name, Last_Name))

			# Show the legacy contacts whose full name occurs more than once.
			legacy.dup <- duplicated(contwav2$FullName) |
			  duplicated(contwav2$FullName, fromLast = TRUE)
			contwav2[legacy.dup, ]
			# write.csv("./contacts/errors/duplicatecontactssinsource.csv")

			# Columns compared between the two sources.
			compare.cols <- c(
			  "External_Key", "Account_External_Key", "FullName", "CountryRegion",
			  "Function", "House_Number", "Street", "City", "Postal_Code", "EMail",
			  "Contact_Owner_ID"
			)

			contwav2 <- contwav2 |>
			  select(all_of(compare.cols)) |>
			  mutate(source = "legacy CRM")
			# unique() is not applied here # Using unique till Dariusz changes the legacy files

			sapcont <- sapcont |>
			  select(all_of(compare.cols)) |>
			  mutate(source = "S4-CAA200") |>
			  filter(!CountryRegion %in% c("AT", "CH")) |>
			  mutate(
			    External_Key = ifelse(External_Key == "", "EMPTY IN SAP", External_Key),
			    Account_External_Key = ifelse(
			      Account_External_Key == "",
			      "EMPTY IN SAP",
			      Account_External_Key
			    )
			  )

			fullcont <- rbind(contwav2, sapcont)

			# Rows whose full name occurs more than once across both sources.
			name.dup <- duplicated(fullcont$FullName) |
			  duplicated(fullcont$FullName, fromLast = TRUE)

			fullcont[name.dup, ] |>
			  rename(Source = source) |>
			  arrange(FullName) |>
			  group_by(FullName) |>
			  # TRUE when every row sharing this name comes from a single source.
			  mutate(same = n_distinct(Source) == 1) |>
			  ungroup() |>
			  # A single-source group is labelled with that source, otherwise "Both".
			  mutate(errorsource = ifelse(same, Source, "Both")) |>
			  select(-same) |>
			  # check if we need to send all, because several are same names in legacy
			  select(External_Key, FullName, Source, errorsource, everything()) |>
			  # Optional filters: errorsource == "legacy CRM" / "S4-CAA200" / "Both"
			  left_join(employeecodes, by = c("Contact_Owner_ID" = "Employee_ID")) |>
			  # Show the owner's name where the employee lookup found one.
			  mutate(Contact_Owner_ID = ifelse(is.na(Name), Contact_Owner_ID, Name)) |>
			  select(-Name) |>
			  write.csv("./contacts/errors/duplicatecontacts.csv", row.names = FALSE)
			```
Ver 0.01 4 years ago