问题描述
我有一个超过 3 亿行的数据框,想检测每个组中的异常值,其中每个组由国家(country)和 ID 共同确定。我编写了以下代码来检测异常点,但运行需要很长时间。您能否建议其他可以让它更快的方法?数据框格式:
df <- data.frame("id" = 1:n, "country" = c("US", ...), "date" = c("2021-01-01", ...), "value" = c(10, ...))
# Flag outliers in `value` separately for every (country, id) group of `df`.
#
# For each group the 10th and 90th percentiles of `value` are computed and the
# distance between them is used as the spread. A row is a "dip" when its value
# is below (10th percentile - 3 * spread) and a "spike" when it is above
# (90th percentile + 3 * spread). Groups without outliers contribute no rows.
#
# The elapsed time of the whole computation is reported by system.time(); the
# flagged rows are stored in `temp_anom`.
registerDoParallel()
groupColumns <- c("country", "id")
system.time(temp_anom <- ddply(df, groupColumns, function(x) {
  x <- x[, c("date", "value")]

  # Percentile-based limits (note: 10%/90%, not the classical 25%/75% IQR).
  resid.q <- quantile(x$value, prob = c(0.1, 0.90))
  spread <- diff(resid.q)
  limits <- resid.q + 3 * spread * c(-1, 1)
  lower_bound <- limits[1]
  upper_bound <- limits[2]

  # Base-R indexing instead of dplyr::filter() %>% data.frame(): no extra
  # packages have to be available on the parallel workers. which() drops
  # comparisons that evaluate to NA.
  outlier_spike_index <- x[which(x$value > upper_bound), , drop = FALSE]
  outlier_spike_index$status <- rep("spike", nrow(outlier_spike_index))
  outlier_dip_index <- x[which(x$value < lower_bound), , drop = FALSE]
  outlier_dip_index$status <- rep("dip", nrow(outlier_dip_index))

  # Combine both kinds unconditionally. Previously the result was only built
  # inside the "spike" branch, so groups that had dips but no spikes silently
  # lost their dips.
  outlier <- rbind(outlier_spike_index, outlier_dip_index)
  if (nrow(outlier) == 0) {
    return(NULL)
  }
  outlier
}, .parallel = TRUE))  # was `.paralle = T`: not matched to `.parallel`, so it was passed on to the function
解决方法
为了提高并行计算的速度,我们需要确定 doParallel 中的最佳内核数,在这种情况下,最佳内核数为 5。只需按如下方式修改代码,就能看到速度的巨大提升。
-- Movie: one row per film, keyed by Movie_ID; Movie_Name must be unique.
-- Last_Date and Rating are the only columns that allow NULL.
CREATE TABLE Movie (
    [Movie_ID]      int         NOT NULL PRIMARY KEY,
    [Movie_Name]    varchar(50) NOT NULL UNIQUE,
    -- NOTE(review): the column name is spelled "Realease_Date" (sic). It is kept
    -- as declared because the name is part of the schema; queries must use this
    -- exact spelling.
    [Realease_Date] date        NOT NULL,
    [Last_Date]     date,
    Runtime         time(0)     NOT NULL,
    Status          varchar(20) NOT NULL,
    Rating          float
);
-- Showtime: a calendar date plus a time of day (second precision), keyed by Showtime_ID.
CREATE TABLE [Showtime] (
    [Showtime_ID] int     NOT NULL PRIMARY KEY,
    [Date]        date    NOT NULL,
    [Time]        time(0) NOT NULL
);
-- Cinema Halls: one row per hall with its seat count, keyed by Cinema_Halls_ID.
CREATE TABLE [Cinema Halls] (
    [Cinema_Halls_ID] int NOT NULL PRIMARY KEY,
    [Total_Seats]     int NOT NULL
);
-- Movie Schedule: links a movie, a showtime and a cinema hall.
-- Every reference is mandatory and enforced by a named foreign key.
CREATE TABLE [Movie Schedule] (
    [Movie_Schedule_ID] int NOT NULL PRIMARY KEY,
    [Movie_ID]          int NOT NULL,
    [Showtime_ID]       int NOT NULL,
    [Cinema_Halls_ID]   int NOT NULL,  -- comma added: table-level constraints follow
    CONSTRAINT fk_M_ID FOREIGN KEY ([Movie_ID])
        REFERENCES Movie ([Movie_ID]),
    CONSTRAINT fk_Sh_ID FOREIGN KEY ([Showtime_ID])
        REFERENCES Showtime ([Showtime_ID]),
    CONSTRAINT fk_C_ID FOREIGN KEY ([Cinema_Halls_ID])
        REFERENCES [Cinema Halls] ([Cinema_Halls_ID])
);
/*
  Trigger_Movie_Shedule
  Stops duplicate booking of cinema halls and invalid showtimes of movies.

  Fires after INSERT or UPDATE on [Movie Schedule] and rolls the transaction
  back when any affected row
    1. uses a (Showtime_ID, Cinema_Halls_ID) pair that another schedule row
       already uses, or
    2. has a showtime date earlier than the movie's Realease_Date or later
       than its Last_Date (a NULL Last_Date means no upper limit).

  All checks are set-based joins against the "inserted" pseudo-table, so
  multi-row statements are validated row by row instead of only one row.

  NOTE: CREATE TRIGGER must be the first statement of its batch; run it
  separately from the CREATE TABLE statements (e.g. after a GO separator in
  SSMS / sqlcmd).
*/
CREATE TRIGGER Trigger_Movie_Shedule
ON [Movie Schedule]
AFTER INSERT, UPDATE
AS
BEGIN
    SET NOCOUNT ON;

    -- Double booking: the same hall at the same showtime on a DIFFERENT
    -- schedule row. The row itself is already in the table when an AFTER
    -- trigger runs, so it has to be excluded by its primary key.
    -- (The previous "IF EXISTS (SELECT COUNT(...))" was always true because an
    -- aggregate without GROUP BY always returns exactly one row.)
    IF EXISTS (
        SELECT 1
        FROM inserted AS i
        INNER JOIN [Movie Schedule] AS ms
            ON  ms.Showtime_ID       = i.Showtime_ID
            AND ms.Cinema_Halls_ID   = i.Cinema_Halls_ID
            AND ms.Movie_Schedule_ID <> i.Movie_Schedule_ID
    )
    BEGIN
        PRINT'This Cinema Hall is Already Booked'
        ROLLBACK TRANSACTION;
        RETURN
    END

    -- Showtime outside the movie's screening window.
    -- The column is spelled Realease_Date in the Movie table definition.
    -- When Last_Date is NULL the second comparison is UNKNOWN, so only the
    -- lower limit applies.
    -- (The previous condition was inverted and rejected dates INSIDE the window.)
    IF EXISTS (
        SELECT 1
        FROM inserted AS i
        INNER JOIN Movie AS m
            ON m.Movie_ID = i.Movie_ID
        INNER JOIN Showtime AS s
            ON s.Showtime_ID = i.Showtime_ID
        WHERE s.[Date] < m.Realease_Date
           OR s.[Date] > m.Last_Date
    )
    BEGIN
        PRINT'Movie Showtime not in Range'
        ROLLBACK TRANSACTION;
        RETURN
    END
END