Data manipulation

Author

Lucy D’Agostino McGowan

library(tidyverse)
library(lubridate)
library(halfmoon)
library(ggridges)
library(zipcodeR)

# Restore the raw request data; the rest of this script expects this call to
# create an object named `data`.
# NOTE(review): load() reads files written by save(), so despite the .rds
# extension this file is presumably not readable with readRDS() -- confirm.
load("data/data.rds")

Focus only on requests made to NYPD.

# Keep only the requests whose agency is the NYPD.
data_nypd <- filter(data, agency == "NYPD")

Create a new variable duration, which represents the time from the Created Date to the Closed Date. Note that duration may be censored for some requests.

# Derive time-to-close and calendar features for every request.
#   duration       - closed_date minus created_date, in seconds
#   duration_hours - the same interval in hours
#   duration_days  - the same interval in days
#   weekday        - day name of created_date (text follows the session locale)
#   weekend        - "yes" if created on Saturday or Sunday, otherwise "no"
# The duration columns are NA when closed_date is missing.
data_nypd <- data_nypd |>
  mutate(
    # Fix the unit explicitly: difftime()'s default ("auto") chooses the unit
    # from the data, so the scale of `duration` could change between extracts.
    # NOTE(review): seconds assumed to be what "auto" selected for this data
    # (consistent with the bandwidth used in the ridge plots) -- confirm.
    duration = difftime(closed_date, created_date, units = "secs") |>
      as.numeric(),
    duration_hours = difftime(closed_date, created_date, units = "hours") |>
      as.numeric(),
    duration_days = difftime(closed_date, created_date, units = "days") |>
      as.numeric(),
    weekday = weekdays(created_date),
    # Use the numeric day of week (Monday = 1, ..., Sunday = 7) instead of
    # matching English day names, so the flag does not depend on the locale.
    # %in% keeps the original behaviour of mapping a missing date to "no".
    weekend = ifelse(
      wday(created_date, week_start = 1) %in% c(6, 7),
      "yes",
      "no"
    )
  )

Visualize the distribution of uncensored duration by weekdays/weekend and by borough, and test whether the distributions are the same across weekdays/weekends of their creation and across boroughs.

# Uncensored requests: those with a recorded closed_date.
data_nypd_uncensored <- filter(data_nypd, !is.na(closed_date))
# Mirrored histogram of duration for the two weekend groups.
# `labels` is spelled out in full (the original `label` only worked through
# partial argument matching); abs() shows the axis labels as magnitudes.
ggplot(data_nypd_uncensored, aes(x = duration, fill = weekend, group = weekend)) +
  geom_mirror_histogram(bins = 30) +
  scale_y_continuous(labels = abs)

# Ridgeline densities of duration, one ridge per weekend group.
# The fill repeats the y-axis grouping, so the legend is hidden.
data_nypd_uncensored |>
  ggplot(aes(x = duration, y = weekend, fill = weekend)) +
  geom_density_ridges(bandwidth = 916) +
  theme(legend.position = "none")

# Ridgeline densities of duration, one ridge per borough.
# The fill repeats the y-axis grouping, so the legend is hidden.
data_nypd_uncensored |>
  ggplot(aes(x = duration, y = borough, fill = borough)) +
  geom_density_ridges(bandwidth = 916) +
  theme(legend.position = "none")

# Two-sample Kolmogorov-Smirnov test comparing the distribution of `duration`
# between the two `weekend` groups among the uncensored requests.
ks.test(duration ~ weekend, data = data_nypd_uncensored)

Warning in ks.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): p-value will be
approximate in the presence of ties


    Asymptotic two-sample Kolmogorov-Smirnov test

data:  duration by weekend
D = 0.028156, p-value = 0.001571
alternative hypothesis: two-sided

It appears that there is a significant difference in the distribution of uncensored duration between weekdays and weekends (D = 0.028, p = 0.002).

Merge the zipcode level information with the NYPD requests data.

## unique, non-missing incident ZIP codes in the NYPD requests
zips <- data_nypd |>
  filter(!is.na(incident_zip)) |>
  distinct(incident_zip) |>
  pull(incident_zip)

## look up ZIP-code-level information with the `zipcodeR` package
data_zip <- reverse_zipcode(zips)

## The lookup above warned: "No data found for ZIP code 10000".
## List the affected requests with their street information.
data_nypd |>
  filter(incident_zip == 10000) |>
  select(
    all_of(c("unique_key", "street_name", "cross_street_1", "cross_street_2"))
  )

## Manual corrections keyed by request id:
## the first two requests belong to ZIP 10024, the last two to ZIP 10065.
fix_zips <- data.frame(
  unique_key = c(56531496, 56563071, 56581350, 56590107),
  incident_zip_fix = rep(c(10024, 10065), each = 2),
  needs_fix = 1
)

## Attach the corrections and overwrite incident_zip wherever one exists.
## The helper columns incident_zip_fix and needs_fix remain in the data.
data_nypd <- left_join(data_nypd, fix_zips, by = "unique_key") |>
  mutate(
    incident_zip = ifelse(is.na(needs_fix), incident_zip, incident_zip_fix)
  )

## re-extract the unique, non-missing ZIP codes from the current data_nypd
zips <- data_nypd |>
  filter(!is.na(incident_zip)) |>
  distinct(incident_zip) |>
  pull(incident_zip)

## rebuild the ZIP-code-level dataset with the `zipcodeR` package
data_zip <- reverse_zipcode(zips)

## add a character ZIP key and join the ZIP-level information onto each request
data_nypd <- left_join(
  mutate(data_nypd, zipcode = as.character(incident_zip)),
  data_zip,
  by = "zipcode"
)

data_nypd

# A tibble: 21,534 × 73
   unique_key created_date        closed_date         agency agency_name        
        <dbl> <dttm>              <dttm>              <chr>  <chr>              
 1   56524984 2023-01-15 00:00:18 2023-01-15 02:37:30 NYPD   New York City Poli…
 2   56526790 2023-01-15 00:00:21 2023-01-15 01:54:24 NYPD   New York City Poli…
 3   56525034 2023-01-15 00:00:50 2023-01-15 01:09:56 NYPD   New York City Poli…
 4   56526234 2023-01-15 00:01:02 2023-01-15 05:47:41 NYPD   New York City Poli…
 5   56530817 2023-01-15 00:01:07 2023-01-15 01:01:26 NYPD   New York City Poli…
 6   56530491 2023-01-15 00:01:12 2023-01-15 01:47:57 NYPD   New York City Poli…
 7   56527886 2023-01-15 00:01:22 2023-01-15 00:15:44 NYPD   New York City Poli…
 8   56530623 2023-01-15 00:01:23 2023-01-15 01:07:30 NYPD   New York City Poli…
 9   56528076 2023-01-15 00:01:34 2023-01-15 01:49:18 NYPD   New York City Poli…
10   56527614 2023-01-15 00:01:59 2023-01-15 01:10:01 NYPD   New York City Poli…
# ℹ 21,524 more rows
# ℹ 68 more variables: complaint_type <chr>, descriptor <chr>,
#   location_type <chr>, incident_zip <dbl>, incident_address <chr>,
#   street_name <chr>, cross_street_1 <chr>, cross_street_2 <chr>,
#   intersection_street_1 <chr>, intersection_street_2 <chr>,
#   address_type <chr>, city <chr>, landmark <chr>, facility_type <chr>,
#   status <chr>, due_date <chr>, resolution_description <chr>, …

# Write the merged NYPD data to disk.
# NOTE(review): save() writes R's save/load format even though the file is
# named .rds, so it must be read back with load(), not readRDS(). Consider
# saveRDS() if all consumers of this file can be updated together.
save(data_nypd, file = "data/data_nypd.rds")