You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
138 lines
3.2 KiB
138 lines
3.2 KiB
---
|
|
title: "EmplClean"
|
|
author: "Scary Scarecrow"
|
|
date: "4/28/2022"
|
|
output: html_document
|
|
---
|
|
|
|
```{r setup, include=FALSE}
|
|
knitr::opts_chunk$set(echo = TRUE)
|
|
library(dplyr)
|
|
library(fuzzyjoin)
|
|
library(stringr)
|
|
|
|
|
|
|
|
tricky_join<- function(x,y,by_x, pattern_y){
|
|
idx_x<- sapply(y[[pattern_y]], grep, x[[by_x]])
|
|
idx_y<-sapply(seq_along(idx_x), function(z) rep(z, length(idx_x[[z]])))
|
|
df<- dplyr::bind_cols(x[unlist(idx_x), , drop=F],
|
|
y[unlist(idx_y), , drop=F])
|
|
return(df)
|
|
}
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
```{r}
|
|
# unlist(sapply(emp[["Name"]], grep, users[["Full_Name"]]))
|
|
# unlist(sapply(emp[["Name"]], grep, users[["Email.Address"]]))
|
|
# unlist(sapply(emp[["Name"]], grep, users[["Display_Name"]]))
|
|
# unlist(sapply(users[["Full_Name"]] , grep, emp[["Name"]]))
|
|
# unlist(sapply(users[["Email.Address"]] , grep, emp[["Name"]]))
|
|
# unlist(sapply(users[["Display_Name"]] , grep, emp[["Name"]]))
|
|
#
|
|
# users |> regex_inner_join(emp, by=c(Full_Name="Name"))
|
|
# str_split(users$Display_Name, ", ")
|
|
#
|
|
#
|
|
stringdist_join(
|
|
emp,
|
|
users,
|
|
by=c("Name"="Description"),
|
|
ignore_case=T,
|
|
max_dist =5,
|
|
method="lcs",
|
|
mode="inner"
|
|
) #|> write.csv("test.csv")
|
|
#
|
|
#
|
|
# regex_inner_join(emp, users, by=c("Name"="Display_Name", "Name"="Full_Name"))
|
|
# match_func<- Vectorize(function(x,y) agrepl(x,y, max.distance = 0.7, ignore.case = T))
|
|
# emp |> fuzzy_inner_join(users, by=c("Name"="Display_Name", "Name"="Full_Name"),match_fun=match_func)
|
|
|
|
emp<-read.csv("employees.csv")
|
|
users<-read.csv("all_users.csv") |> dplyr::select(c(Display_Name, Full_Name, Country, Department,
|
|
Email.Address, Home.Phone.Number,
|
|
Job.Title, Mobile.Phone.Number, Office, State.Province,
|
|
Telephone.Number, Username, Description)) |> unique()
|
|
|
|
colnames(users)
|
|
colnames(emp)
|
|
emp1<-emp |> tidyr::separate(Name, into = c("FName","SName"), sep=" ", extra = "merge", remove = F)
|
|
|
|
emp1
|
|
|
|
users<-users |>
|
|
tidyr::separate(Display_Name,into=c("SName","FName"), sep=",", extra = "merge", remove = F)
|
|
```
|
|
|
|
|
|
|
|
```{r}
|
|
unique(users$Country)
|
|
|
|
sapply(emp1$FName, grep, users$Display_Name, simplify = T)
|
|
|
|
df<-NULL
|
|
|
|
grepl("a",c("asitav","sen"))
|
|
sapply(c("a","b","c"), grepl,c("asitav","sen","hen"))
|
|
a1<-lapply(emp1$FName, grepl,users$Display_Name)
|
|
a2<-lapply(emp1$SName, grepl,users$Display_Name)
|
|
a3<-lapply(emp1$FName, grepl,users$Username)
|
|
a4<-lapply(emp1$SName, grepl,users$Username)
|
|
a5<-lapply(emp1$FName, grepl,users$Description)
|
|
a6<-lapply(emp1$SName, grepl,users$Description)
|
|
a7<-NULL
|
|
|
|
for(i in 1:length(a1)){
|
|
a7[[i]]<- a1[[i]] + a2[[i]] + a3[[i]] + a4[[i]] + a5[[i]] + a6[[i]]
|
|
}
|
|
|
|
a8<-lapply(a7,which.max)
|
|
a10<-lapply(a7,max)
|
|
a9<-lapply(a8, function(x) ifelse(x%%length(users)!=0, x%%length(users),x/length(users)))
|
|
a9
|
|
df<-NULL
|
|
dt<-NULL
|
|
score<-NULL
|
|
for(i in 1:length(a9)){
|
|
df<-bind_cols(emp1[i, ], users[a9[[i]],])
|
|
score[i]<-a10[i]
|
|
dt<-rbind(dt,df)
|
|
}
|
|
dt$score<-unlist(score)
|
|
dt |> arrange(desc(score))
|
|
dat<-merge(emp1,users)
|
|
nrow(dat)
|
|
|
|
dat<-stringdist_join(
|
|
emp1,
|
|
users,
|
|
by=c("FName"="FName","SName"="SName"),
|
|
ignore_case=T,
|
|
max_dist =3,
|
|
method="lcs",
|
|
mode="full"
|
|
)
|
|
dat |>
|
|
select(-c(2,3,7,8)) |> readr::write_excel_csv("test.csv",na="Match Not Found")
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|