You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

138 lines
3.2 KiB

---
title: "EmplClean"
author: "Scary Scarecrow"
date: "4/28/2022"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(fuzzyjoin)
library(stringr)
tricky_join<- function(x,y,by_x, pattern_y){
idx_x<- sapply(y[[pattern_y]], grep, x[[by_x]])
idx_y<-sapply(seq_along(idx_x), function(z) rep(z, length(idx_x[[z]])))
df<- dplyr::bind_cols(x[unlist(idx_x), , drop=F],
y[unlist(idx_y), , drop=F])
return(df)
}
```
```{r}
# unlist(sapply(emp[["Name"]], grep, users[["Full_Name"]]))
# unlist(sapply(emp[["Name"]], grep, users[["Email.Address"]]))
# unlist(sapply(emp[["Name"]], grep, users[["Display_Name"]]))
# unlist(sapply(users[["Full_Name"]] , grep, emp[["Name"]]))
# unlist(sapply(users[["Email.Address"]] , grep, emp[["Name"]]))
# unlist(sapply(users[["Display_Name"]] , grep, emp[["Name"]]))
#
# users |> regex_inner_join(emp, by=c(Full_Name="Name"))
# str_split(users$Display_Name, ", ")
#
#
stringdist_join(
emp,
users,
by=c("Name"="Description"),
ignore_case=T,
max_dist =5,
method="lcs",
mode="inner"
) #|> write.csv("test.csv")
#
#
# regex_inner_join(emp, users, by=c("Name"="Display_Name", "Name"="Full_Name"))
# match_func<- Vectorize(function(x,y) agrepl(x,y, max.distance = 0.7, ignore.case = T))
# emp |> fuzzy_inner_join(users, by=c("Name"="Display_Name", "Name"="Full_Name"),match_fun=match_func)
emp<-read.csv("employees.csv")
users<-read.csv("all_users.csv") |> dplyr::select(c(Display_Name, Full_Name, Country, Department,
Email.Address, Home.Phone.Number,
Job.Title, Mobile.Phone.Number, Office, State.Province,
Telephone.Number, Username, Description)) |> unique()
colnames(users)
colnames(emp)
emp1<-emp |> tidyr::separate(Name, into = c("FName","SName"), sep=" ", extra = "merge", remove = F)
emp1
users<-users |>
tidyr::separate(Display_Name,into=c("SName","FName"), sep=",", extra = "merge", remove = F)
```
```{r}
unique(users$Country)
sapply(emp1$FName, grep, users$Display_Name, simplify = T)
df<-NULL
grepl("a",c("asitav","sen"))
sapply(c("a","b","c"), grepl,c("asitav","sen","hen"))
a1<-lapply(emp1$FName, grepl,users$Display_Name)
a2<-lapply(emp1$SName, grepl,users$Display_Name)
a3<-lapply(emp1$FName, grepl,users$Username)
a4<-lapply(emp1$SName, grepl,users$Username)
a5<-lapply(emp1$FName, grepl,users$Description)
a6<-lapply(emp1$SName, grepl,users$Description)
a7<-NULL
for(i in 1:length(a1)){
a7[[i]]<- a1[[i]] + a2[[i]] + a3[[i]] + a4[[i]] + a5[[i]] + a6[[i]]
}
a8<-lapply(a7,which.max)
a10<-lapply(a7,max)
a9<-lapply(a8, function(x) ifelse(x%%length(users)!=0, x%%length(users),x/length(users)))
a9
df<-NULL
dt<-NULL
score<-NULL
for(i in 1:length(a9)){
df<-bind_cols(emp1[i, ], users[a9[[i]],])
score[i]<-a10[i]
dt<-rbind(dt,df)
}
dt$score<-unlist(score)
dt |> arrange(desc(score))
dat<-merge(emp1,users)
nrow(dat)
dat<-stringdist_join(
emp1,
users,
by=c("FName"="FName","SName"="SName"),
ignore_case=T,
max_dist =3,
method="lcs",
mode="full"
)
dat |>
select(-c(2,3,7,8)) |> readr::write_excel_csv("test.csv",na="Match Not Found")
```