--- title: "EmplClean" author: "Scary Scarecrow" date: "4/28/2022" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) library(dplyr) library(fuzzyjoin) library(stringr) tricky_join<- function(x,y,by_x, pattern_y){ idx_x<- sapply(y[[pattern_y]], grep, x[[by_x]]) idx_y<-sapply(seq_along(idx_x), function(z) rep(z, length(idx_x[[z]]))) df<- dplyr::bind_cols(x[unlist(idx_x), , drop=F], y[unlist(idx_y), , drop=F]) return(df) } ``` ```{r} # unlist(sapply(emp[["Name"]], grep, users[["Full_Name"]])) # unlist(sapply(emp[["Name"]], grep, users[["Email.Address"]])) # unlist(sapply(emp[["Name"]], grep, users[["Display_Name"]])) # unlist(sapply(users[["Full_Name"]] , grep, emp[["Name"]])) # unlist(sapply(users[["Email.Address"]] , grep, emp[["Name"]])) # unlist(sapply(users[["Display_Name"]] , grep, emp[["Name"]])) # # users |> regex_inner_join(emp, by=c(Full_Name="Name")) # str_split(users$Display_Name, ", ") # # stringdist_join( emp, users, by=c("Name"="Description"), ignore_case=T, max_dist =5, method="lcs", mode="inner" ) #|> write.csv("test.csv") # # # regex_inner_join(emp, users, by=c("Name"="Display_Name", "Name"="Full_Name")) # match_func<- Vectorize(function(x,y) agrepl(x,y, max.distance = 0.7, ignore.case = T)) # emp |> fuzzy_inner_join(users, by=c("Name"="Display_Name", "Name"="Full_Name"),match_fun=match_func) emp<-read.csv("employees.csv") users<-read.csv("all_users.csv") |> dplyr::select(c(Display_Name, Full_Name, Country, Department, Email.Address, Home.Phone.Number, Job.Title, Mobile.Phone.Number, Office, State.Province, Telephone.Number, Username, Description)) |> unique() colnames(users) colnames(emp) emp1<-emp |> tidyr::separate(Name, into = c("FName","SName"), sep=" ", extra = "merge", remove = F) emp1 users<-users |> tidyr::separate(Display_Name,into=c("SName","FName"), sep=",", extra = "merge", remove = F) ``` ```{r} unique(users$Country) sapply(emp1$FName, grep, users$Display_Name, simplify = T) df<-NULL grepl("a",c("asitav","sen")) sapply(c("a","b","c"), grepl,c("asitav","sen","hen")) a1<-lapply(emp1$FName, grepl,users$Display_Name) a2<-lapply(emp1$SName, grepl,users$Display_Name) a3<-lapply(emp1$FName, grepl,users$Username) a4<-lapply(emp1$SName, grepl,users$Username) a5<-lapply(emp1$FName, grepl,users$Description) a6<-lapply(emp1$SName, grepl,users$Description) a7<-NULL for(i in 1:length(a1)){ a7[[i]]<- a1[[i]] + a2[[i]] + a3[[i]] + a4[[i]] + a5[[i]] + a6[[i]] } a8<-lapply(a7,which.max) a10<-lapply(a7,max) a9<-lapply(a8, function(x) ifelse(x%%length(users)!=0, x%%length(users),x/length(users))) a9 df<-NULL dt<-NULL score<-NULL for(i in 1:length(a9)){ df<-bind_cols(emp1[i, ], users[a9[[i]],]) score[i]<-a10[i] dt<-rbind(dt,df) } dt$score<-unlist(score) dt |> arrange(desc(score)) dat<-merge(emp1,users) nrow(dat) dat<-stringdist_join( emp1, users, by=c("FName"="FName","SName"="SName"), ignore_case=T, max_dist =3, method="lcs", mode="full" ) dat |> select(-c(2,3,7,8)) |> readr::write_excel_csv("test.csv",na="Match Not Found") ```