在 R 中使用并行计算进行异常检测

问题描述

我有一个超过 3 亿行的数据框,想检测每个组(每个组由国家和 ID 共同确定)中的异常。我编写了以下代码来检测异常点,但运行起来非常耗时。您能否建议其他可以让它更快的方法?数据框格式如下:

 df <- data.frame("id" = 1:n, "country" = c("US", ...), "date" = c("2021-01-01", ...), "value" = c(10, ...))


    registerDoParallel()
# A parallel backend is registered above; ddply only distributes groups across
# workers when it is called with `.parallel = TRUE` (exact spelling required,
# because the argument comes after `...` and is therefore not partially matched).
groupColumns <- c("country","id")

# For every (country, id) group, return the rows whose `value` falls outside
#   [q10 - 3 * (q90 - q10),  q90 + 3 * (q90 - q10)]
# tagged with status "spike" (above the upper bound) or "dip" (below the lower
# bound). Groups without outliers contribute zero rows to `temp_anom`.
system.time(temp_anom <- ddply(df,groupColumns,function(x){
  x <- x[,c('date','value')]

  # Note: `iqr` here is the spread between the 10th and 90th percentiles,
  # not the classical 25th-75th inter-quartile range.
  resid.q <- quantile(x$value,prob = c(0.1,0.90))
  iqr <- diff(resid.q)
  limits <- resid.q + 3 * iqr * c(-1,1)
  lower_bound <- limits[1]
  upper_bound <- limits[2]

  # which() drops NA comparisons, matching what dplyr::filter() did before.
  outlier_spike_index <- x[which(x$value > upper_bound), , drop = FALSE]
  outlier_dip_index <- x[which(x$value < lower_bound), , drop = FALSE]

  # rep(..., nrow(.)) keeps the assignment valid for zero-row frames, so both
  # pieces always carry the same columns and can be row-bound safely.
  outlier_spike_index$status <- rep("spike", nrow(outlier_spike_index))
  outlier_dip_index$status <- rep("dip", nrow(outlier_dip_index))

  # Always return the combined result; previously it was only returned when at
  # least one spike existed, so dip-only groups were silently discarded.
  rbind(outlier_spike_index, outlier_dip_index)
}, .parallel = TRUE))

解决方法

为了提高并行计算的速度,我们需要确定 doParallel 中的最佳内核数;在这种情况下,最佳内核数为 5。只有按如下方式修改代码(例如将 `registerDoParallel()` 改为 `registerDoParallel(cores = 5)`),我们才能看到速度的巨大改进。

-- Movie: one row per film, keyed by Movie_ID, with a unique name.
-- NOTE(review): 'Realease_Date' looks like a misspelling of 'Release_Date';
-- the name is preserved because renaming a column changes the table's interface.
CREATE TABLE Movie (
    [Movie_ID]      int         NOT NULL PRIMARY KEY,
    [Movie_Name]    varchar(50) NOT NULL UNIQUE,
    [Realease_Date] date        NOT NULL,
    [Last_Date]     date,                 -- nullable
    Runtime         time(0)     NOT NULL,
    Status          varchar(20) NOT NULL,
    Rating          float                 -- nullable
);

-- Showtime: a calendar date plus a time of day (second precision), keyed by Showtime_ID.
CREATE TABLE [Showtime] (
    [Showtime_ID] int     NOT NULL PRIMARY KEY,
    [Date]        date    NOT NULL,
    [Time]        time(0) NOT NULL
);
-- Cinema Halls: one row per hall with its seat count.
CREATE TABLE [Cinema Halls] (
    [Cinema_Halls_ID] int NOT NULL PRIMARY KEY,
    [Total_Seats]     int NOT NULL
);

-- Movie Schedule: assigns a movie to a showtime in a cinema hall.
-- Every reference is enforced by a named foreign key.
CREATE TABLE [Movie Schedule] (
    [Movie_Schedule_ID] int NOT NULL PRIMARY KEY,
    [Movie_ID]          int NOT NULL,
    [Showtime_ID]       int NOT NULL,
    [Cinema_Halls_ID]   int NOT NULL,   -- comma added: the table-level constraints below are separate list items
    CONSTRAINT fk_M_ID  FOREIGN KEY ([Movie_ID])
        REFERENCES Movie ([Movie_ID]),
    CONSTRAINT fk_Sh_ID FOREIGN KEY ([Showtime_ID])
        REFERENCES Showtime ([Showtime_ID]),
    CONSTRAINT fk_C_ID  FOREIGN KEY ([Cinema_Halls_ID])
        REFERENCES [Cinema Halls] ([Cinema_Halls_ID])
);


/*
  Trigger_Movie_Shedule
  Stops duplicate booking of cinema halls and invalid showtimes of a movie.

  Fires after INSERT or UPDATE on [Movie Schedule] and rolls the transaction
  back when any affected row
    1. uses a (Showtime_ID, Cinema_Halls_ID) pair already taken by another
       schedule row, or
    2. has a showtime date before the movie's release date or after its last
       date (a NULL Last_Date is treated as "no upper limit").

  Both checks are set-based joins against the "inserted" pseudo-table, so every
  row of a multi-row statement is validated.

  NOTE: CREATE TRIGGER must be the first statement of its batch; run it
  separately from the CREATE TABLE statements.
*/
Create Trigger Trigger_Movie_Shedule
On [Movie Schedule]
After Insert,Update
As
BEGIN
    -- Keep row-count messages from the checks below away from the caller.
    SET NOCOUNT ON;

    -- Nothing to validate when the statement affected no rows.
    IF NOT EXISTS (SELECT 1 FROM inserted)
        RETURN;

    -- Duplicate hall booking. In an AFTER trigger the new rows are already in
    -- the table, so each row must be excluded from matching itself.
    IF EXISTS (
        SELECT 1
        FROM inserted AS i
        INNER JOIN [Movie Schedule] AS ms
            ON  ms.Showtime_ID       = i.Showtime_ID
            AND ms.Cinema_Halls_ID   = i.Cinema_Halls_ID
            AND ms.Movie_Schedule_ID <> i.Movie_Schedule_ID
    )
    BEGIN
        PRINT 'This Cinema Hall is Already Booked'
        ROLLBACK TRANSACTION;
        RETURN
    END

    -- Showtime outside the movie's run. The column is spelled
    -- [Realease_Date] (sic) in the Movie table definition.
    IF EXISTS (
        SELECT 1
        FROM inserted AS i
        INNER JOIN Showtime AS s
            ON s.Showtime_ID = i.Showtime_ID
        INNER JOIN Movie AS m
            ON m.Movie_ID = i.Movie_ID
        WHERE s.[Date] < m.[Realease_Date]
           OR s.[Date] > m.[Last_Date]   -- UNKNOWN (not rejected) when Last_Date is NULL
    )
    BEGIN
        PRINT 'Movie Showtime not in Range'
        ROLLBACK TRANSACTION;
        RETURN
    END
END