library(tidyverse)
library(lubridate)
library(halfmoon)
library(ggridges)
library(zipcodeR)
Data manipulation
# load() restores objects written by save() into the workspace under their
# saved names.
# NOTE(review): presumably this defines `data`, which is used below -- confirm.
load("data/data.rds")
Focus only on requests made to NYPD.
# Keep only the rows whose agency is "NYPD".
data_nypd <- data |>
  filter(agency == "NYPD")
Create a new variable, duration, which represents the time from the Created Date to the Closed Date. Note that duration may be censored for some requests.
# Add the created-to-closed duration (in seconds, hours and days), the weekday
# on which each request was created, and a "yes"/"no" weekend flag.
# Durations are NA wherever closed_date is NA.
data_nypd <- data_nypd |>
  mutate(
    # Units are pinned explicitly: difftime()'s default ("auto") chooses the
    # unit from the data, so the scale of `duration` could silently change
    # with a different extract.
    # NOTE(review): seconds is assumed to match earlier runs (the ridge plots
    # hard-code bandwidth = 916) -- confirm.
    duration = difftime(closed_date, created_date, units = "secs") |>
      as.numeric(),
    duration_hours = difftime(closed_date, created_date, units = "hours") |>
      as.numeric(),
    duration_days = difftime(closed_date, created_date, units = "days") |>
      as.numeric(),
    # Day name as text; weekdays() returns it in the session locale.
    weekday = weekdays(created_date),
    # With week_start = 1, wday() numbers Monday = 1 ... Sunday = 7, so 6 and 7
    # are Saturday and Sunday regardless of locale (matching on English day
    # names would mark every row "no" under a non-English locale).
    weekend = ifelse(
      wday(created_date, week_start = 1) %in% c(6, 7),
      "yes",
      "no"
    )
  )
Visualize the distribution of uncensored duration by weekdays/weekend and by borough, and test whether the distributions are the same across weekdays/weekends of their creation and across boroughs.
# Uncensored requests: rows that have a recorded closed_date.
data_nypd_uncensored <- data_nypd |>
  filter(!is.na(closed_date))
# Mirrored histogram (halfmoon::geom_mirror_histogram) of duration, grouped
# and filled by weekend status. `labels = abs` prints the y-axis labels as
# absolute values; the argument is spelled out in full rather than relying on
# partial matching of `label`.
ggplot(
  data_nypd_uncensored,
  aes(x = duration, fill = weekend, group = weekend)
) +
  geom_mirror_histogram(bins = 30) +
  scale_y_continuous(labels = abs)
# Ridge-line densities of duration, one ridge per weekend group. The legend is
# dropped because the y-axis already names each group.
data_nypd_uncensored |>
  ggplot(mapping = aes(x = duration, y = weekend, fill = weekend)) +
  geom_density_ridges(bandwidth = 916) +
  theme(legend.position = "none")

# Same display, one ridge per borough.
data_nypd_uncensored |>
  ggplot(mapping = aes(x = duration, y = borough, fill = borough)) +
  geom_density_ridges(bandwidth = 916) +
  theme(legend.position = "none")
# Two-sample Kolmogorov-Smirnov test comparing the distribution of duration
# between the two weekend groups.
ks.test(duration ~ weekend, data = data_nypd_uncensored)
Warning in ks.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): p-value will be
approximate in the presence of ties
Asymptotic two-sample Kolmogorov-Smirnov test
data: duration by weekend
D = 0.028156, p-value = 0.001571
alternative hypothesis: two-sided
It appears that there is a significant difference in the distribution of uncensored duration between weekdays and weekends (p = 0.002).
Merge the zipcode level information with the NYPD requests data.
## extract unique zipcodes (missing values dropped)
zips <- data_nypd |>
  filter(!is.na(incident_zip)) |>
  distinct(incident_zip) |>
  pull(incident_zip)

## create dataset of zipcode info using `zipcodeR` package
data_zip <- reverse_zipcode(zips)
## investigate warning:
## Warning: No data found for ZIP code 10000
## list the affected requests with their street information
data_nypd |>
  filter(incident_zip == 10000) |>
  select(unique_key, street_name, cross_street_1, cross_street_2)
## The first two should be 10024
## The second two should be 10065
## lookup table of corrected ZIP codes, keyed by request id
fix_zips <- data.frame(
  unique_key = c(56531496, 56563071, 56581350, 56590107),
  incident_zip_fix = c(10024, 10024, 10065, 10065),
  needs_fix = 1
)

## overwrite incident_zip only for the requests listed in fix_zips; the helper
## columns incident_zip_fix and needs_fix stay in data_nypd after the join
data_nypd <- data_nypd |>
  left_join(fix_zips, by = "unique_key") |>
  mutate(
    incident_zip = ifelse(
      !is.na(needs_fix),
      incident_zip_fix,
      incident_zip
    )
  )
## extract unique zipcodes (missing values dropped)
zips <- data_nypd |>
  filter(!is.na(incident_zip)) |>
  distinct(incident_zip) |>
  pull(incident_zip)

## create dataset of zipcode info using `zipcodeR` package
data_zip <- reverse_zipcode(zips)
## merge zipcode data in with full data set
## incident_zip is converted to character so it can be joined on `zipcode`
data_nypd <- data_nypd |>
  mutate(zipcode = as.character(incident_zip)) |>
  left_join(data_zip, by = "zipcode")

## print the merged data
data_nypd
# A tibble: 21,534 × 73
unique_key created_date closed_date agency agency_name
<dbl> <dttm> <dttm> <chr> <chr>
1 56524984 2023-01-15 00:00:18 2023-01-15 02:37:30 NYPD New York City Poli…
2 56526790 2023-01-15 00:00:21 2023-01-15 01:54:24 NYPD New York City Poli…
3 56525034 2023-01-15 00:00:50 2023-01-15 01:09:56 NYPD New York City Poli…
4 56526234 2023-01-15 00:01:02 2023-01-15 05:47:41 NYPD New York City Poli…
5 56530817 2023-01-15 00:01:07 2023-01-15 01:01:26 NYPD New York City Poli…
6 56530491 2023-01-15 00:01:12 2023-01-15 01:47:57 NYPD New York City Poli…
7 56527886 2023-01-15 00:01:22 2023-01-15 00:15:44 NYPD New York City Poli…
8 56530623 2023-01-15 00:01:23 2023-01-15 01:07:30 NYPD New York City Poli…
9 56528076 2023-01-15 00:01:34 2023-01-15 01:49:18 NYPD New York City Poli…
10 56527614 2023-01-15 00:01:59 2023-01-15 01:10:01 NYPD New York City Poli…
# ℹ 21,524 more rows
# ℹ 68 more variables: complaint_type <chr>, descriptor <chr>,
# location_type <chr>, incident_zip <dbl>, incident_address <chr>,
# street_name <chr>, cross_street_1 <chr>, cross_street_2 <chr>,
# intersection_street_1 <chr>, intersection_street_2 <chr>,
# address_type <chr>, city <chr>, landmark <chr>, facility_type <chr>,
# status <chr>, due_date <chr>, resolution_description <chr>, …
# Persist the merged data. save() writes an RData-format file, so despite the
# .rds extension it has to be read back with load(), not readRDS().
save(data_nypd, file = "data/data_nypd.rds")