Datacamp: Data Manipulation with dplyr

Chapter 1

1. Selecting columns

# Keep only the four columns of interest from the counties table
counties %>%
  select(
    state,
    county,
    population,
    poverty
  )

2. Arranging observations

# Narrow counties down to location, size, and the three work-type columns
counties_selected <- counties %>%
  select(
    state, county, population,
    private_work, public_work, self_employed
  )

# Show the rows with the largest public_work values first
counties_selected %>%
  arrange(desc(public_work))

3. Filtering for conditions

# Keep only the state, county, and population columns
counties_selected <- counties %>%
  select(
    state,
    county,
    population
  )

# Keep the California rows whose population is above one million
# (a row is kept only when both conditions are true)
counties_selected %>%
  filter(
    state == "California",
    population > 1e6
  )

4. Mutate

# Keep the columns needed to compute the number of public workers
counties_selected <- counties %>%
  select(
    state,
    county,
    population,
    public_work
  )

# public_work is divided by 100 below, i.e. it is treated as a percentage of
# population; the result is then sorted from most to fewest public workers
counties_selected %>%
  mutate(public_workers = population * public_work / 100) %>%
  arrange(desc(public_workers))

5. Conclusion

# Chain all four verbs: select -> mutate -> filter -> arrange
counties %>%
  # Five columns: location, population, and the men/women counts
  select(
    state,
    county,
    population,
    men,
    women
  ) %>%
  # Share of the population that is male
  mutate(proportion_men = men / population) %>%
  # Drop counties with fewer than 10,000 people
  filter(population >= 1e4) %>%
  # Highest proportion of men first
  arrange(desc(proportion_men))

Chapter 2

1. The count verb

eg1. Use count() to find the number of counties in each region, using a second argument to sort in descending order.

# Number of counties per region, most frequent region first
# NOTE(review): the last counties_selected definition visible in these notes
# does not select region; presumably it was rebuilt for this exercise - confirm
counties_selected %>%
  count(
    region,
    sort = TRUE
  )

eg2. Count the number of counties in each state, weighted based on the citizens column, and sorted in descending order.

# Per-state totals of the citizens column (wt sums citizens instead of
# counting rows), largest total first
# NOTE(review): assumes counties_selected contains a citizens column; that
# definition is not shown in these notes - confirm
counties_selected %>%
  count(
    state,
    wt = citizens,
    sort = TRUE
  )

2. The group by, summarize and ungroup verbs

eg1.

# Collapse the whole table into one row of summary statistics
# NOTE(review): assumes counties_selected has population, unemployment, and
# income columns; that definition is not shown in these notes - confirm
counties_selected %>%
  summarize(
    min_population = min(population),
    max_unemployment = max(unemployment),
    average_income = mean(income)
  )

eg2. Group the data by state, and summarize to create the columns total_area (with total area in square miles) and total_population (with total population).

# One row per state with its summed land area and summed population
# NOTE(review): assumes counties_selected has a land_area column; that
# definition is not shown in these notes - confirm
counties_selected %>%
  group_by(state) %>%
  summarize(
    total_area = sum(land_area),
    total_population = sum(population)
  )

eg3. Add a density column with the people per square mile, then arrange in descending order.

# Per-state totals, then people per unit of land area, densest state first
counties_selected %>%
  group_by(state) %>%
  summarize(
    total_area = sum(land_area),
    total_population = sum(population)
  ) %>%
  # density is computed from the two state-level totals above
  mutate(density = total_population / total_area) %>%
  arrange(desc(density))

eg4. Summarize to find the total population, as a column called total_pop, in each combination of region and state

# Total population for every (region, state) combination
counties_selected %>%
  group_by(
    region,
    state
  ) %>%
  summarize(total_pop = sum(population))

eg5. Notice the tibble is still grouped by region; use another summarize() step to calculate two new columns: the average state population in each region (average_pop) and the median state population in each region (median_pop).

# Step 1: total population for every (region, state) combination
# Step 2: the result of step 1 is still grouped by region, so a second
#         summarize() yields one row per region
counties_selected %>%
  group_by(
    region,
    state
  ) %>%
  summarize(total_pop = sum(population)) %>%
  summarize(
    average_pop = mean(total_pop),
    median_pop = median(total_pop)
  )

3. The top_n verb

# Columns used by the top_n examples that follow
counties_selected <- counties %>%
  select(
    region,
    state,
    county,
    metro,
    population,
    walk
  )

eg1. Find the county in each region with the highest percentage of citizens who walk to work.

# For each region, keep the county row with the highest walk value
counties_selected %>%
  group_by(region) %>%
  top_n(n = 1, wt = walk)

eg2. In how many states do more people live in metro areas than non-metro areas?

(Recall that the metro column has one of two values: "Metro" (for high-density city areas) or "Nonmetro" (for suburban and country areas).)

# Only state, metro, and population are needed for this question
counties_selected <- counties %>%
  select(
    state,
    metro,
    population
  )

counties_selected %>%
  # Total population for every (state, metro) combination
  group_by(
    state,
    metro
  ) %>%
  summarize(total_pop = sum(population)) %>%
  # Within each state, keep the metro category with the larger total
  top_n(n = 1, wt = total_pop) %>%
  # Remove the grouping so count() tallies across all states
  ungroup() %>%
  # Number of states where each metro category has the larger population
  count(metro)

Chapter 3

1. Selecting

(1) colon( : )

...
$ professional       <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.~
$ service            <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.~
$ office             <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.~
$ construction       <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5~
$ production         <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.~
...

eg. Select the columns for state, county, population and (using a colon) all five of those industry-related variables; there are five consecutive variables in the table related to the industry of people's work: professional, service, office, construction, and production.

# professional:production selects every column from professional through
# production, i.e. the five consecutive industry columns
counties %>%
  select(
    state,
    county,
    population,
    professional:production
  ) %>%
  # Largest service value first
  arrange(desc(service))

(2) starts_with() / ends_with()

eg.

# ends_with("work") picks up every column whose name ends in "work"
counties %>%
  select(
    state,
    county,
    population,
    ends_with("work")
  ) %>%
  # Keep counties where public_work is 50 or more
  filter(public_work >= 50)

2. The rename verb

eg1.

# count() stores its tally in a column called n; give it a clearer name
counties %>%
  count(state) %>%
  rename(
    num_counties = n
  )

eg2.

# select() can rename while selecting: poverty comes out as poverty_rate
counties %>%
  select(
    state,
    county,
    poverty_rate = poverty
  )

3. The transmute verb

transmute: use it to calculate new columns while dropping all columns that are not explicitly listed

eg1. Keep only the state, county, and population columns, and add a new column, density, that contains the population per land_area.

# Keep the state, county, and population columns, add density, and drop
# every other column
counties %>%
  transmute(
    state,
    county,
    population,
    density = population / land_area
  )

eg2.

# rename(): change one column name, keep every column
counties %>%
  rename(
    unemployment_rate = unemployment
  )

# select(): state, county, plus any column whose name contains "poverty"
counties %>%
  select(
    state,
    county,
    contains("poverty")
  )

# mutate(): add fraction_women while keeping all existing columns
counties %>%
  mutate(
    fraction_women = women / population
  )

# transmute(): compute employment_rate and keep only the listed columns
counties %>%
  transmute(
    state,
    county,
    employment_rate = employed / population
  )

Chapter 4

1. The babynames data

eg1. Filter for only the names Steven, Thomas, and Matthew, and assign it to an object called selected_names.

# Rows of babynames whose name is one of the three listed names
selected_names <- babynames %>%
  filter(
    name %in% c("Steven", "Thomas", "Matthew")
  )

eg2. Visualize those three names as a line plot over time, with each name represented by a different color.

# Line plot of number by year, one colored line per name
ggplot(
  data = selected_names,
  mapping = aes(x = year, y = number, color = name)
) +
  geom_line()

2. Grouped mutates

eg1. Calculate the total number of people born in each year in this dataset as year_total.

# Share of each year's births that went to each name
babynames %>%
  # year_total: sum of number over all rows of the same year
  group_by(year) %>%
  mutate(
    year_total = sum(number)
  ) %>%
  # Ungroup before the row-wise division
  ungroup() %>%
  mutate(
    fraction = number / year_total
  )

eg2. Now use your newly calculated fraction column, in combination with top_n(), to identify the year in which each name is most common.

# Share of each year's births that went to each name ...
babynames %>%
  group_by(year) %>%
  mutate(
    year_total = sum(number)
  ) %>%
  ungroup() %>%
  mutate(
    fraction = number / year_total
  ) %>%
  # ... then, for each name, keep the row where that share is largest
  group_by(name) %>%
  top_n(n = 1, wt = fraction)

eg3. Use a grouped mutate to add two columns: (1) name_total, with the sum of the number of babies born with that name in the entire dataset. (2) name_max, with the maximum number of babies born in any year.

# Per-name aggregates attached to every row of that name:
#   name_total - sum of number across the dataset
#   name_max   - largest number in any single row
babynames %>%
  group_by(name) %>%
  mutate(
    name_total = sum(number),
    name_max = max(number)
  )

eg4. (1) Add another step to ungroup the table. (2) Add a column called fraction_max containing the number in the year divided by name_max.

# Compare each row's number with the peak number for the same name
babynames %>%
  # Per-name total and per-name maximum of number
  group_by(name) %>%
  mutate(
    name_total = sum(number),
    name_max = max(number)
  ) %>%
  # Remove the grouping before the row-wise division
  ungroup() %>%
  # fraction_max is number divided by the name's maximum
  mutate(
    fraction_max = number / name_max
  )

3. Window functions

# Small example vector used to demonstrate lag()
v <- c(1, 3, 6, 14)
v
#> [1]  1  3  6 14

# The output shows each value moved one position to the right, with NA in the
# first slot.
# NOTE(review): this output matches dplyr::lag(), so dplyr is presumably
# attached here; stats::lag() from base R does not shift values this way -
# confirm.
lag(v)
#> [1] NA  1  3  6

# Subtracting the lagged vector gives the change from the previous element
v - lag(v)
#> [1] NA  2  3  8

eg. (1) Arrange the data in ascending order of name and then year. (2) Group by name so that your mutate works within each name. (3) Add a column ratio containing the ratio (not difference) of fraction between each year.

# Year-over-year ratio of fraction within each name
# NOTE(review): babynames_fraction is not defined in these notes; presumably
# it is babynames with the fraction column added per year - confirm
babynames_fraction %>%
  # Sort first so lag() sees the years of each name in ascending order
  arrange(
    name,
    year
  ) %>%
  # Group by name so lag() restarts for each name
  group_by(name) %>%
  # Current fraction divided by the previous row's fraction
  mutate(
    ratio = fraction / lag(fraction)
  )

r语言