问题描述
# Fix the RNG seed so the simulated sample is reproducible.
set.seed(123)
# Five groups ("a" to "e") with 20 observations each (100 rows in total).
# The previous call rnorm(20, 1, 2, 3, 4, 1) passed more arguments than
# rnorm(n, mean, sd) accepts, and could not have supplied the 100 values
# needed to match `group`.
# NOTE(review): the per-group means (1, 1, 2, 3, 4) are an assumed reading of
# the broken original call -- confirm the intended values.
df <- tibble(
  group = rep(c("a", "b", "c", "d", "e"), each = 20),
  values = rnorm(100, mean = rep(c(1, 1, 2, 3, 4), each = 20))
)
我想使用 summarize() 来获得每组的分位数,可以写成
# Lower and upper quartile of `values` for every group, stacked in a single
# `quantiles` column (two rows per group).
# NOTE(review): recent dplyr releases reportedly deprecate multi-row
# summarize() results in favour of reframe() -- confirm against the installed
# version before relying on this form.
df %>%
  group_by(group) %>%
  summarize(quantiles = quantile(values, probs = c(0.25, 0.75)))
# Alternative layout: one row per group, with the lower and upper quartile
# of `values` stored in two separate columns.
df %>%
  group_by(group) %>%
  summarize(
    quantile0.25 = quantile(values, probs = 0.25),
    quantile0.75 = quantile(values, probs = 0.75)
  )
以上两种写法任选其一。我不知道哪种更实用:是每组一行、用两个变量分别保存两个分位数,还是每组两行、用一个变量保存分位数。
最后,我想(最好在同一管道中)用这些分位数,按各自所属的组过滤原始数据帧(而不是汇总后的数据帧)中的异常值,例如
# Schematic only: the arguments of every step are intentionally left blank.
# It sketches the desired shape of a single pipeline on the original data.
df %>%
  group_by() %>%   # group the data
  summarize() %>%  # derive the per-group quantiles
  filter()         # keep only the rows that are not outliers
其中每组按其各自的分位数 ± 1.5 × IQR 过滤(即只保留介于 Q1 − 1.5 × IQR 与 Q3 + 1.5 × IQR 之间的值)。
这可能吗,最好的方法是什么? 我认为使用一个应用于所有组的过滤器值按组过滤会很简单,但如何为每个组应用不同的过滤器值?
解决方法
您可以编写一个函数来通过 IQR 检测异常值
Variable_name Value
Aborted_clients 8659
Aborted_connects 0
Binlog_cache_disk_use 0
Binlog_cache_use 0
Binlog_stmt_cache_disk_use 0
Binlog_stmt_cache_use 0
Bytes_received 10528823106
Bytes_sent 11879095113
Com_admin_commands 114321
Com_assign_to_keycache 0
Com_alter_db 0
Com_alter_db_upgrade 0
Com_alter_event 0
Com_alter_function 0
Com_alter_procedure 0
Com_alter_server 0
Com_alter_table 0
Com_alter_tablespace 0
Com_analyze 0
Com_begin 0
Com_binlog 0
Com_call_procedure 0
Com_change_db 2494
Com_change_master 0
Com_check 0
Com_checksum 0
Com_commit 21669803
Com_create_db 0
Com_create_event 0
Com_create_function 0
Com_create_index 0
Com_create_procedure 0
Com_create_server 0
Com_create_table 0
Com_create_trigger 0
Com_create_udf 0
Com_create_user 0
Com_create_view 0
Com_dealloc_sql 0
Com_delete 57847
Com_delete_multi 2
Com_do 0
Com_drop_db 0
Com_drop_event 0
Com_drop_function 0
Com_drop_index 0
Com_drop_procedure 0
Com_drop_server 0
Com_drop_table 0
Com_drop_trigger 0
Com_drop_user 0
Com_drop_view 0
Com_empty_query 0
Com_execute_sql 0
Com_flush 1
Com_grant 0
Com_ha_close 0
Com_ha_open 0
Com_ha_read 0
Com_help 0
Com_insert 10384315
Com_insert_select 4493
Com_install_plugin 0
Com_kill 0
Com_load 0
Com_lock_tables 0
Com_optimize 0
Com_preload_keys 0
Com_prepare_sql 0
Com_purge 0
Com_purge_before_date 0
Com_release_savepoint 0
Com_rename_table 0
Com_rename_user 0
Com_repair 0
Com_replace 0
Com_replace_select 0
Com_reset 0
Com_resignal 0
Com_revoke 0
Com_revoke_all 0
Com_rollback 0
Com_rollback_to_savepoint 0
Com_savepoint 0
Com_select 31617317
Com_set_option 585637
Com_signal 0
Com_show_authors 0
Com_show_binlog_events 0
Com_show_binlogs 0
Com_show_charsets 0
Com_show_collations 0
Com_show_contributors 0
Com_show_create_db 0
Com_show_create_event 0
Com_show_create_func 0
Com_show_create_proc 0
Com_show_create_table 0
Com_show_create_trigger 0
Com_show_databases 1
Com_show_engine_logs 0
Com_show_engine_mutex 0
Com_show_engine_status 4415
Com_show_events 0
Com_show_errors 0
Com_show_fields 7780
Com_show_function_status 0
Com_show_grants 0
Com_show_keys 0
Com_show_master_status 0
Com_show_open_tables 0
Com_show_plugins 0
Com_show_privileges 0
Com_show_procedure_status 0
Com_show_processlist 0
Com_show_profile 0
Com_show_profiles 0
Com_show_relaylog_events 0
Com_show_slave_hosts 0
Com_show_slave_status 0
Com_show_status 4416
Com_show_storage_engines 0
Com_show_table_status 0
Com_show_tables 2492
Com_show_triggers 0
Com_show_variables 9040
Com_show_warnings 11880457
Com_slave_start 0
Com_slave_stop 0
Com_stmt_close 48226
Com_stmt_execute 90467
Com_stmt_fetch 0
Com_stmt_prepare 48354
Com_stmt_reprepare 0
Com_stmt_reset 0
Com_stmt_send_long_data 218
Com_truncate 16
Com_uninstall_plugin 0
Com_unlock_tables 0
Com_update 11304832
Com_update_multi 0
Com_xa_commit 0
Com_xa_end 0
Com_xa_prepare 0
Com_xa_recover 0
Com_xa_rollback 0
Com_xa_start 0
Compression ON
Connections 692888
Created_tmp_disk_tables 969524
Created_tmp_files 152
Created_tmp_tables 1104523
Delayed_errors 0
Delayed_insert_threads 0
Delayed_writes 0
Flush_commands 1
Handler_commit 74944534
Handler_delete 9802964
Handler_discover 0
Handler_prepare 0
Handler_read_first 333333
Handler_read_key 61866586
Handler_read_last 3454093
Handler_read_next 13704879326
Handler_read_prev 2773935641
Handler_read_rnd 39932346
Handler_read_rnd_next 9899032663
Handler_rollback 170160
Handler_savepoint 0
Handler_savepoint_rollback 0
Handler_update 40464627
Handler_write 180607735
Innodb_buffer_pool_pages_data 768282
Innodb_buffer_pool_bytes_data 12587532288
Innodb_buffer_pool_pages_dirty 10741
Innodb_buffer_pool_bytes_dirty 175980544
Innodb_buffer_pool_pages_flushed 39908997
Innodb_buffer_pool_pages_free 563241
Innodb_buffer_pool_pages_misc 110237
Innodb_buffer_pool_pages_total 1441760
Innodb_buffer_pool_read_ahead_rnd 0
Innodb_buffer_pool_read_ahead 24195
Innodb_buffer_pool_read_ahead_evicted 0
Innodb_buffer_pool_read_requests 61256179019
Innodb_buffer_pool_reads 519946
Innodb_buffer_pool_wait_free 0
Innodb_buffer_pool_write_requests 307699280
Innodb_data_fsyncs 22277830
Innodb_data_pending_fsyncs 1
Innodb_data_pending_reads 0
Innodb_data_pending_writes 0
Innodb_data_read 10761654272
Innodb_data_reads 656718
Innodb_data_writes 59599294
Innodb_data_written 1327119237120
Innodb_dblwr_pages_written 39909506
Innodb_dblwr_writes 2757000
Innodb_have_atomic_builtins ON
Innodb_log_waits 0
Innodb_log_write_requests 22935831
Innodb_log_writes 16721934
Innodb_os_log_fsyncs 16783015
Innodb_os_log_pending_fsyncs 1
Innodb_os_log_pending_writes 0
Innodb_os_log_written 19349207552
Innodb_page_size 16384
Innodb_pages_created 111573
Innodb_pages_read 656709
Innodb_pages_written 39909506
Innodb_row_lock_current_waits 0
Innodb_row_lock_time 78010
Innodb_row_lock_time_avg 84
Innodb_row_lock_time_max 3217
Innodb_row_lock_waits 926
Innodb_rows_deleted 9802964
Innodb_rows_inserted 10494572
Innodb_rows_read 25829527401
Innodb_rows_updated 13355863
Innodb_truncated_status_writes 0
Key_blocks_not_flushed 0
Key_blocks_unused 1674
Key_blocks_used 21
Key_read_requests 358190032
Key_reads 0
Key_write_requests 116148559
Key_writes 0
Last_query_cost 0.000000
Max_used_connections 247
Not_flushed_delayed_rows 0
Open_files 48
Open_streams 0
Open_table_definitions 325
Open_tables 597
Opened_files 3878714
Opened_table_definitions 341
Opened_tables 620
Performance_schema_cond_classes_lost 0
Performance_schema_cond_instances_lost 0
Performance_schema_file_classes_lost 0
Performance_schema_file_handles_lost 0
Performance_schema_file_instances_lost 0
Performance_schema_locker_lost 0
Performance_schema_mutex_classes_lost 0
Performance_schema_mutex_instances_lost 0
Performance_schema_rwlock_classes_lost 0
Performance_schema_rwlock_instances_lost 0
Performance_schema_table_handles_lost 0
Performance_schema_table_instances_lost 0
Performance_schema_thread_classes_lost 0
Performance_schema_thread_instances_lost 0
Prepared_stmt_count 128
Qcache_free_blocks 0
Qcache_free_memory 0
Qcache_hits 0
Qcache_inserts 0
Qcache_lowmem_prunes 0
Qcache_not_cached 0
Qcache_queries_in_cache 0
Qcache_total_blocks 0
Queries 88430604
Questions 88219702
Rpl_status AUTH_MASTER
Select_full_join 3100
Select_full_range_join 806
Select_range 14288643
Select_range_check 0
Select_scan 225040
Slave_heartbeat_period 0.000
Slave_open_temp_tables 0
Slave_received_heartbeats 0
Slave_retried_transactions 0
Slave_running OFF
Slow_launch_threads 0
Slow_queries 18
Sort_merge_passes 147
Sort_range 7993917
Sort_rows 23603619
Sort_scan 934553
Ssl_accept_renegotiates 0
Ssl_accepts 0
Ssl_callback_cache_hits 0
Ssl_cipher
Ssl_cipher_list
Ssl_client_connects 0
Ssl_connect_renegotiates 0
Ssl_ctx_verify_depth 0
Ssl_ctx_verify_mode 0
Ssl_default_timeout 0
Ssl_finished_accepts 0
Ssl_finished_connects 0
Ssl_session_cache_hits 0
Ssl_session_cache_misses 0
Ssl_session_cache_mode NONE
Ssl_session_cache_overflows 0
Ssl_session_cache_size 0
Ssl_session_cache_timeouts 0
Ssl_sessions_reused 0
Ssl_used_session_cache_entries 0
Ssl_verify_depth 0
Ssl_verify_mode 0
Ssl_version
Table_locks_immediate 67102620
Table_locks_waited 0
Tc_log_max_pages_used 0
Tc_log_page_size 0
Tc_log_page_waits 0
Threads_cached 58
Threads_connected 102
Threads_created 674
Threads_running 5
Uptime 84415
Uptime_since_flush_status 84415
然后你就可以在分组后的过滤器中使用它,例如 `df %>% group_by(group) %>% filter(!is_iqr_outlier(values))`。函数定义如下:
#' Flag outliers using the interquartile-range (IQR) rule
#'
#' A value is flagged when it lies below `Q1 - k * IQR` or above
#' `Q3 + k * IQR`, where `Q1` and `Q3` are the 25% and 75% quantiles of `x`
#' and `IQR = Q3 - Q1`.
#'
#' @param x A vector accepted by [stats::quantile()], typically numeric.
#' @param k Non-negative multiplier applied to the IQR when building the
#'   fences. The default of 1.5 reproduces the conventional rule.
#' @param na.rm If `TRUE`, missing values are ignored when the quartiles are
#'   computed and their own flag is `NA`. If `FALSE` (the default), missing
#'   values make [stats::quantile()] raise an error, as before.
#' @return A logical vector the same length as `x`; `TRUE` marks an outlier.
#' @examples
#' is_iqr_outlier(c(1, 2, 3, 4, 5, 100))
is_iqr_outlier <- function(x, k = 1.5, na.rm = FALSE) {
  stopifnot(is.numeric(k), length(k) == 1L, !is.na(k), k >= 0)

  # names = FALSE keeps the "25%"/"75%" labels from leaking into the result,
  # which previously happened for length-one input.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm, names = FALSE)
  spread <- quartiles[2] - quartiles[1]

  lower_fence <- quartiles[1] - k * spread
  upper_fence <- quartiles[2] + k * spread

  (x < lower_fence) | (x > upper_fence)
}
过滤器将按组运行。您的样本数据似乎没有任何异常值,因此它不是一个很好的测试用例。