library(tidyverse)
library(pointblank)
library(ggmap)
library(visdat)
Data cleaning
# data <- read_csv("https://github.com/statds/ids-s23/raw/8c68649925d069a1d93a71022b87b26a61b0c180/data/nyc311_011523-012123_by022023.csv")
# spec(data)
<- read_csv(
raw_data "https://github.com/statds/ids-s23/raw/8c68649925d069a1d93a71022b87b26a61b0c180/data/nyc311_011523-012123_by022023.csv",
col_types = cols(
`Unique Key` = col_double(),
`Created Date` =
col_datetime(format = "%m/%d/%Y %I:%M:%S %p"), # date
`Closed Date` =
col_datetime(format = "%m/%d/%Y %I:%M:%S %p"), #date
Agency = col_character(),
`Agency Name` = col_character(),
`Complaint Type` = col_character(),
Descriptor = col_character(),
`Location Type` = col_character(),
`Incident Zip` = col_double(),
`Incident Address` = col_character(),
`Street Name` = col_character(),
`Cross Street 1` = col_character(),
`Cross Street 2` = col_character(),
`Intersection Street 1` = col_character(),
`Intersection Street 2` = col_character(),
`Address Type` = col_character(),
City = col_character(),
Landmark = col_character(),
`Facility Type` = col_character(),
Status = col_character(),
`Due Date` = col_character(),
`Resolution Description` = col_character(),
`Resolution Action Updated Date` = col_character(),
`Community Board` = col_character(),
BBL = col_character(),
Borough = col_character(),
`X Coordinate (State Plane)` = col_double(),
`Y Coordinate (State Plane)` = col_double(),
`Open Data Channel Type` = col_character(),
`Park Facility Name` = col_character(),
`Park Borough` = col_character(),
`Vehicle Type` = col_character(),
`Taxi Company Borough` = col_character(),
`Taxi Pick Up Location` = col_character(), # changed to character
`Bridge Highway Name` = col_character(),
`Bridge Highway Direction` = col_character(),
`Road Ramp` = col_character(),
`Bridge Highway Segment` = col_character(),
Latitude = col_double(),
Longitude = col_double(),
Location = col_character()
))
Data cleaning
For ease of comparison across languages, make the column names consistent in style with lowercase using underscore to separate words within a name.
<- raw_data |>
data rename_with(~str_replace_all(., " ", "_") |> tolower(), everything())
Check for obvious errors or inefficiencies. For example, are there records whose Closed Date is earlier than or exactly the same as the Created Date? Are their invalid values for any columns? Are any columns redundant?
scan_data(data, sections = "OVM") |>
export_report("tbl_scan.html")
✔ The table scan has been written as `tbl_scan.html`
|>
data create_agent() |>
col_vals_not_null("unique_key") |>
rows_distinct("unique_key") |>
col_vals_lt(created_date, vars(closed_date), na_pass = TRUE) |>
col_vals_lte(created_date, vars(closed_date), na_pass = TRUE) |>
col_vals_between(latitude, -90, 90, na_pass = TRUE) |>
col_vals_between(longitude, -180, 180, na_pass = TRUE) |>
interrogate()
Pointblank Validation | |||||||||||||
[2023-11-02|15:53:16]
tibble
data
|
|||||||||||||
STEP | COLUMNS | VALUES | TBL | EVAL | UNITS | PASS | FAIL | W | S | N | EXT | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | col_vals_not_null()
|
— |
|
✓ |
54K |
54K 1.00 |
0 0.00 |
— |
— |
— |
— | ||
2 | rows_distinct()
|
— |
|
✓ |
54K |
54K 1.00 |
0 0.00 |
— |
— |
— |
— | ||
3 | col_vals_lt()
|
|
✓ |
54K |
53K 0.97 |
2K 0.03 |
— |
— |
— |
||||
4 | col_vals_lte()
|
|
✓ |
54K |
54K 0.99 |
134 0.01 |
— |
— |
— |
||||
5 | col_vals_between()
|
|
✓ |
54K |
54K 1.00 |
0 0.00 |
— |
— |
— |
— | |||
6 | col_vals_between()
|
|
✓ |
54K |
54K 1.00 |
0 0.00 |
— |
— |
— |
— | |||
2023-11-02 15:53:16 EDT < 1 s 2023-11-02 15:53:17 EDT |
Fill in missing values if possible. For example, if incident zip code is missing but the location is not, the zip code could be recovered by geocoding.
<- function(lat, lon) {
get_zipcode <- glue::glue("photon.komoot.io/reverse?lon={lon}&lat={lat}")
url <- httr::GET(url) |> httr::content()
o <- o$features[[1]]$properties$postcode |> as.numeric()
zipcode if (length(zipcode) == 0) {
return(NA)
}return(zipcode)
}
<- possibly(get_zipcode, otherwise = NA)
safe_zip <- data |>
fixed_zips filter(is.na(incident_zip) & !is.na(longitude) & !is.na(latitude)) |>
mutate(zip_fixed = map2_dbl(latitude, longitude, safe_zip)) |>
select(unique_key, zip_fixed)
save(fixed_zips, file = "data/fixed_zips.rds")
load("data/fixed_zips.rds")
<- data |>
data left_join(fixed_zips, by = "unique_key") |>
mutate(incident_zip = case_when(
is.na(incident_zip) ~ zip_fixed,
!is.na(incident_zip) ~ incident_zip
))
Summarize your suggestions to the data curator in several bullet points.
- Several of the columns have >90% missingness:
facility_type
due_date
vehicle_type
taxi_company_borough
taxi_pick_up_location
bridge_highway_name
bridge_highway_direction
road_ramp
bridge_highway_segment
- A small number of rows (134) have closed dates prior to created dates. These are all “Street Light Condition” complaints and will be removed.
<- data |>
data filter(!(created_date > closed_date))
save(data, file = "data/data.rds")