第一步:將數據格式化爲事務矩陣
# Import relevant packages
library(dplyr)
library(tidyverse)
library(arulesSequences)
# Import standard transaction data
transactions <- read.csv("transactions.csv")
# Start time of data to be considered
start_month <- "2015-07-01"
# Create list of Azure services by customer ID and CleanMonth (formatted dates):
# one row per customer-month, with the basket size (SIZE) and the basket
# itself as a ';'-joined string of service levels.
trans_sequence <- transactions %>%
  group_by(CustomerID, CleanMonth) %>%
  summarize(
    SIZE = n(),
    ServiceLevel = paste(as.character(ServiceLevel), collapse = ';'),
    # Drop all grouping so downstream column subsetting and factor
    # conversion operate on a plain (ungrouped) table.
    .groups = "drop"
  )
# Whole calendar months from start_date to end_date, using only the
# year and month fields (days within the month are ignored). Vectorized
# over end_date. Used below to derive a numeric event index per month.
elapsed_months <- function(end_date, start_date) {
  end <- as.POSIXlt(end_date)
  start <- as.POSIXlt(start_date)
  (end$year - start$year) * 12 + (end$mon - start$mon)
}
# Event ID = months elapsed since start_month (sequence ID = customer).
trans_sequence$eventID <- elapsed_months(trans_sequence$CleanMonth, start_month)
# Select by NAME (not position) into the column layout read_baskets expects:
# sequenceID; eventID; SIZE; items.
trans_sequence <- trans_sequence[, c("CustomerID", "eventID", "SIZE", "ServiceLevel")]
names(trans_sequence) <- c("sequenceID", "eventID", "SIZE", "items")
# Make event and sequence IDs into factors (as.factor on the numeric
# eventID keeps numeric level order, so the sort below stays chronological).
trans_sequence <- data.frame(lapply(trans_sequence, as.factor))
trans_sequence <- trans_sequence[order(trans_sequence$sequenceID, trans_sequence$eventID), ]
# Convert to transaction matrix data type: round-trip through the
# semicolon-delimited text format that read_baskets() consumes.
write.table(trans_sequence, "mytxtout.txt", sep=";", row.names = FALSE, col.names = FALSE, quote = FALSE)
trans_matrix <- read_baskets("mytxtout.txt", sep = ";", info = c("sequenceID","eventID","SIZE"))
第二步:運行 SPADE 算法
# Mine frequent sequences with SPADE: keep every sequence supported by at
# least 30% of customers, logging progress to the console.
s1 <- cspade(
  trans_matrix,
  parameter = list(support = 0.3),
  control = list(verbose = TRUE)
)
summary(s1)
# Flatten the sequences object into a data frame for inspection.
s1.df <- as(s1, "data.frame")
第三步:找到並解釋序列規則
# Get induced temporal rules from frequent itemsets, keeping only rules
# with confidence >= 0.5, then flatten to a data frame.
induced_rules <- ruleInduction(s1, confidence = 0.5, control = list(verbose = TRUE))
r1 <- as(induced_rules, "data.frame")
返回的數據見圖4。我們可以將 rule 列解釋爲:在接下來的幾個月內,哪些 Azure 服務包會需要額外的服務。此外,對於所有規則,我們都可以從三個維度去比較:支持度、置信度和提升度。
# Separate LHS and RHS rules.
# Rules render as strings like "<{A},{B}> => <{C}>": a sequence of LHS
# baskets, the " => " separator, then a single predicted basket
# (format per arulesSequences' rule printing — confirm if the package
# version changes).
r1$rulecount <- as.character(r1$rule)
# Widest split on " => "; expected to be 2 (one LHS part, one RHS part).
max_col <- max(sapply(strsplit(r1$rulecount,' => '),length))
r_sep <- separate(data = r1, col = rule, into = paste0("Time",1:max_col), sep = " => ")
# Strip the leading "<{" and trailing "}>" from the RHS basket string.
r_sep$Time2 <- substring(r_sep$Time2,3,nchar(r_sep$Time2)-2)
# Strip LHS baskets
# Largest number of "},"-delimited baskets in any rule's LHS — this sets
# how many Previous_Items columns separate() must create.
max_time1 <- max(sapply(strsplit(r_sep$Time1,'},'),length))
# Remove the enclosing "<{" / "}>", then replace the "},{"  basket
# delimiter with the sentinel "zzz" so separate() can split on it
# (assumes "zzz" never occurs inside an item name — TODO confirm).
r_sep$TimeClean <- substring(r_sep$Time1,3,nchar(r_sep$Time1)-2)
r_sep$TimeClean <- gsub("\\},\\{", "zzz", r_sep$TimeClean)
r_sep_items <- separate(data = r_sep, col = TimeClean, into = paste0("Previous_Items",1:max_time1), sep = "zzz")
# Get cleaned temporal rules: time reads sequentially from left to right.
# separate() left-aligns baskets into Previous_Items1..N, leaving trailing
# NAs for shorter sequences; shift each row's baskets to the RIGHT so the
# most recent LHS basket always sits in the last Previous_Items column.
r_shift_na <- r_sep_items
# Previous_Items columns occupy positions 7 .. (6 + max_time1).
# Guard: with a single Previous_Items column there is nothing to shift,
# and the original seq(8, 7) would run backwards and index a
# nonexistent column 8.
if (max_time1 > 1) {
  for (i in seq_len(nrow(r_shift_na))) {
    for (col in seq(8, 6 + max_time1)) {
      if (is.na(r_shift_na[i, col])) {
        r_shift_na[i, col] <- r_shift_na[i, col - 1]
        r_shift_na[i, col - 1] <- NA
      }
    }
  }
}
# Column 2 holds the stripped RHS basket (Time2) — the prediction.
names(r_shift_na)[2] <- "Predicted_Items"
# Reorder: all LHS basket columns first, then the prediction and the
# rule metrics (support, confidence, lift).
cols <- c(7:(6 + max_time1), 2:5)
temporal_rules <- r_shift_na[, cols]
# Rank rules: best lift first, ties broken by confidence, then support.
temporal_rules <- temporal_rules[order(-temporal_rules$lift, -temporal_rules$confidence, -temporal_rules$support, temporal_rules$Predicted_Items), ]
write.csv(as.data.frame(temporal_rules), file = "TemporalRules.csv", row.names = FALSE, na = "")
# Get unique frequent itemsets existing in rules (subset of those in s1.df).
# The last three columns of temporal_rules are the metrics (support,
# confidence, lift); everything before them is a basket column.
n_basket_cols <- ncol(temporal_rules) - 3
basket_values <- unlist(temporal_rules[, seq_len(n_basket_cols)], use.names = FALSE)
freq_itemsets_in_rules <- unique(basket_values[!is.na(basket_values)])
write.csv(as.data.frame(freq_itemsets_in_rules), file = "FreqItemsetsInRules.csv", row.names = FALSE)
其他需要考慮的事項:
參考文獻:
-
Srikant, R., Agrawal, R., Apers, P., Bouzeghoub, M., Gardarin, G. (1996) "Mining sequential patterns: Generalizations and performance improvements", Advances in Database Technology — EDBT '96 vol. 1 no. 17.
-
Aileen P. Wright, Adam T. Wright, Allison B. McCoy, Dean F. Sittig, The use of sequential pattern mining to predict next prescribed medications, Journal of Biomedical Informatics, Volume 53, 2015, Pages 73-80, ISSN 1532-0464, https://doi.org/10.1016/j.jbi.2014.09.003.
-
Song SJ., Huang Z., Hu HP., Jin SY. (2004) A Sequential Pattern Mining Algorithm for Misuse Intrusion Detection. In: Jin H., Pan Y., Xiao N., Sun J. (eds) Grid and Cooperative Computing - GCC 2004 Workshops. GCC 2004. Lecture Notes in Computer Science, vol 3252. Springer, Berlin, Heidelberg.
-
Package ‘arulesSequences’ documentation: https://cran.r-project.org/web/packages/arulesSequences/arulesSequences.pdf.
-
J. Zaki. (2001). SPADE: An Efficient Algorithm for Mining Frequent Sequences. Machine Learning Journal, 42, 31–60.
-
Data Mining Algorithms in R: https://en.wikibooks.org/wiki/Data_Mining_Algorithms_In_R/Sequence_Mining/SPADE.
-
Analyzing Transaction Data like a Data Scientist: https://rpubs.com/Mahsa_A/Part4_AnalyzeTransactionData.