Data cleaning

Author

Lucy D’Agostino McGowan

library(tidyverse)
library(pointblank)
library(ggmap)
library(visdat)
# data <- read_csv("https://github.com/statds/ids-s23/raw/8c68649925d069a1d93a71022b87b26a61b0c180/data/nyc311_011523-012123_by022023.csv")
# spec(data)
raw_data <- read_csv(
  "https://github.com/statds/ids-s23/raw/8c68649925d069a1d93a71022b87b26a61b0c180/data/nyc311_011523-012123_by022023.csv", 
  col_types = cols(
    `Unique Key` = col_double(),
    `Created Date` = 
      col_datetime(format = "%m/%d/%Y %I:%M:%S %p"), # date
    `Closed Date` = 
      col_datetime(format = "%m/%d/%Y %I:%M:%S %p"), #date
    Agency = col_character(),
    `Agency Name` = col_character(),
    `Complaint Type` = col_character(),
    Descriptor = col_character(),
    `Location Type` = col_character(),
    `Incident Zip` = col_double(),
    `Incident Address` = col_character(),
    `Street Name` = col_character(),
    `Cross Street 1` = col_character(),
    `Cross Street 2` = col_character(),
    `Intersection Street 1` = col_character(),
    `Intersection Street 2` = col_character(),
    `Address Type` = col_character(),
    City = col_character(),
    Landmark = col_character(),
    `Facility Type` = col_character(),
    Status = col_character(),
    `Due Date` = col_character(),
    `Resolution Description` = col_character(),
    `Resolution Action Updated Date` = col_character(),
    `Community Board` = col_character(),
    BBL = col_character(),
    Borough = col_character(),
    `X Coordinate (State Plane)` = col_double(),
    `Y Coordinate (State Plane)` = col_double(),
    `Open Data Channel Type` = col_character(),
    `Park Facility Name` = col_character(),
    `Park Borough` = col_character(),
    `Vehicle Type` = col_character(),
    `Taxi Company Borough` = col_character(),
    `Taxi Pick Up Location` = col_character(), # changed to character
    `Bridge Highway Name` = col_character(),
    `Bridge Highway Direction` = col_character(),
    `Road Ramp` = col_character(),
    `Bridge Highway Segment` = col_character(),
    Latitude = col_double(),
    Longitude = col_double(),
    Location = col_character()
  ))

Data cleaning

For ease of comparison across languages, make the column names consistent in style with lowercase using underscore to separate words within a name.

data <- raw_data |>
  rename_with(~str_replace_all(., " ", "_") |> tolower(), everything())

Check for obvious errors or inefficiencies. For example, are there records whose Closed Date is earlier than or exactly the same as the Created Date? Are their invalid values for any columns? Are any columns redundant?

scan_data(data, sections = "OVM") |>
  export_report("tbl_scan.html")
✔ The table scan has been written as `tbl_scan.html`
data |>
  create_agent() |>
  col_vals_not_null("unique_key") |>
  rows_distinct("unique_key") |>
  col_vals_lt(created_date, vars(closed_date), na_pass = TRUE) |>
  col_vals_lte(created_date, vars(closed_date), na_pass = TRUE) |>
  col_vals_between(latitude, -90, 90, na_pass = TRUE) |>
  col_vals_between(longitude, -180, 180, na_pass = TRUE) |>
  interrogate()
Pointblank Validation
[2023-11-02|15:53:16]

tibble data
STEP COLUMNS VALUES TBL EVAL UNITS PASS FAIL W S N EXT
1
col_vals_not_null
 col_vals_not_null()

unique_key

54K 54K
1.00
0
0.00

2
rows_distinct
 rows_distinct()

unique_key

54K 54K
1.00
0
0.00

3
col_vals_lt
 col_vals_lt()

created_date

closed_date

54K 53K
0.97
2K
0.03

4
col_vals_lte
 col_vals_lte()

created_date

closed_date

54K 54K
0.99
134
0.01

5
col_vals_between
 col_vals_between()

latitude

[−90, 90]

54K 54K
1.00
0
0.00

6
col_vals_between
 col_vals_between()

longitude

[−180, 180]

54K 54K
1.00
0
0.00

2023-11-02 15:53:16 EDT < 1 s 2023-11-02 15:53:17 EDT

Fill in missing values if possible. For example, if incident zip code is missing but the location is not, the zip code could be recovered by geocoding.

get_zipcode <- function(lat, lon) {
  url <- glue::glue("photon.komoot.io/reverse?lon={lon}&lat={lat}")
  o <- httr::GET(url) |> httr::content()
  zipcode <- o$features[[1]]$properties$postcode |> as.numeric()
  if (length(zipcode) == 0) {
    return(NA)
  }
  return(zipcode)
}

safe_zip <- possibly(get_zipcode, otherwise = NA)
fixed_zips <- data |>
  filter(is.na(incident_zip) & !is.na(longitude) & !is.na(latitude)) |>
  mutate(zip_fixed = map2_dbl(latitude, longitude, safe_zip)) |>
  select(unique_key, zip_fixed)

save(fixed_zips, file = "data/fixed_zips.rds")
load("data/fixed_zips.rds")
data <- data |>
  left_join(fixed_zips, by = "unique_key") |>
  mutate(incident_zip = case_when(
    is.na(incident_zip) ~ zip_fixed,
    !is.na(incident_zip) ~ incident_zip
  ))

Summarize your suggestions to the data curator in several bullet points.

  • Several of the columns have >90% missingness:
    • facility_type
    • due_date
    • vehicle_type
    • taxi_company_borough
    • taxi_pick_up_location
    • bridge_highway_name
    • bridge_highway_direction
    • road_ramp
    • bridge_highway_segment
  • A small number of rows (134) have closed dates prior to created dates. These are all “Street Light Condition” complaints and will be removed.
data <- data |>
  filter(!(created_date > closed_date))
save(data, file = "data/data.rds")