-
Notifications
You must be signed in to change notification settings - Fork 0
/
anomaly.R
84 lines (69 loc) · 2.73 KB
/
anomaly.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# detect anomalies in healthcare claims data ----
# load libraries ----
source("https://tinyurl.com/libraryQ")
library("igraph")
library("dplyr")
library("magrittr")
library("tibble")
library("tidyr")
# read the data ----
# from : Medicare Physician and Other Supplier Data CY 2012
# source : http://tinyurl.com/j4bxtj9
required <- c(1, 14, 17, 22)
cols <- 1:28
cols <- ifelse(cols %in% required, "character", "NULL")
input <- read.csv("data_pagerank.csv"
, skip = 1
, colClasses = cols
, strip.white = TRUE
, na.strings = "NA"
, nrows = 10000
)
names(input) <- c("doctor", "speciality", "procedure", "counts")
input <- mutate(input, counts = as.integer(gsub(",", "", counts)))
# function to get doctors for a particular speciality
get_doctors <- function(sp){
input$doctor[input$speciality == sp]
}
# all doctor-speciality combinations ----
doc_spe <- input %>%
group_by(doctor, speciality) %>%
ungroup() %>%
select(doctor, speciality) %>%
unique
# create adjacency matrix ----
adj_df <- input %>%
group_by(doctor, procedure) %>%
summarise(counts = sum(counts)) %>%
ungroup %>%
spread(key = procedure, value = counts, fill = 0)
adj_mat <- adj_df[,-1] %>% as.matrix %>% dist %>% as.matrix
adj_mat <- ifelse(adj_mat > 0, 1/adj_mat, 1)
diag(adj_mat) <- 0
rownames(adj_mat) <- adj_df[["doctor"]]
# create graph object ----
gr <- graph_from_adjacency_matrix(adj_mat
, mode = "undirected"
, weighted = TRUE)
# plot(gr)
# function to get personalized weights for a speciality
get_weights <- function(sp, weight = 25){
ifelse(adj_df[["doctor"]] %in% get_doctors(sp)
, weight
, 1)
}
# run personalized pagerank algorithm on the graph object ----
set.seed(1)
pr <- page_rank(gr, personalized = get_weights("Gastroenterology"))
# function to get top 'k' scores
get_scores <- function(k){
data.frame(pr$vector) %>%
transmute(doctor = adj_df[["doctor"]], score = pr.vector) %>%
left_join(doc_spe, by = "doctor") %>%
arrange(desc(score)) %>%
head(k)
}
# observe top 'k' scores for "Gastroenterology" ----
print(get_scores(30))
# observe that 13, 14, 15 th rows are the anomalies
rm(list = ls())