Data cleaning

Author

Lucy D’Agostino McGowan

library(tidyverse)
library(pointblank)
library(ggmap)
library(visdat)

# data <- read_csv("https://github.com/statds/ids-s23/raw/8c68649925d069a1d93a71022b87b26a61b0c180/data/nyc311_011523-012123_by022023.csv")
# spec(data)
raw_data <- read_csv(
  "https://github.com/statds/ids-s23/raw/8c68649925d069a1d93a71022b87b26a61b0c180/data/nyc311_011523-012123_by022023.csv", 
  col_types = cols(
    `Unique Key` = col_double(),
    `Created Date` = 
      col_datetime(format = "%m/%d/%Y %I:%M:%S %p"), # date
    `Closed Date` = 
      col_datetime(format = "%m/%d/%Y %I:%M:%S %p"), #date
    Agency = col_character(),
    `Agency Name` = col_character(),
    `Complaint Type` = col_character(),
    Descriptor = col_character(),
    `Location Type` = col_character(),
    `Incident Zip` = col_double(),
    `Incident Address` = col_character(),
    `Street Name` = col_character(),
    `Cross Street 1` = col_character(),
    `Cross Street 2` = col_character(),
    `Intersection Street 1` = col_character(),
    `Intersection Street 2` = col_character(),
    `Address Type` = col_character(),
    City = col_character(),
    Landmark = col_character(),
    `Facility Type` = col_character(),
    Status = col_character(),
    `Due Date` = col_character(),
    `Resolution Description` = col_character(),
    `Resolution Action Updated Date` = col_character(),
    `Community Board` = col_character(),
    BBL = col_character(),
    Borough = col_character(),
    `X Coordinate (State Plane)` = col_double(),
    `Y Coordinate (State Plane)` = col_double(),
    `Open Data Channel Type` = col_character(),
    `Park Facility Name` = col_character(),
    `Park Borough` = col_character(),
    `Vehicle Type` = col_character(),
    `Taxi Company Borough` = col_character(),
    `Taxi Pick Up Location` = col_character(), # changed to character
    `Bridge Highway Name` = col_character(),
    `Bridge Highway Direction` = col_character(),
    `Road Ramp` = col_character(),
    `Bridge Highway Segment` = col_character(),
    Latitude = col_double(),
    Longitude = col_double(),
    Location = col_character()
  ))

Data cleaning

For ease of comparison across languages, make the column names consistent in style with lowercase using underscore to separate words within a name.

data <- raw_data |>
  rename_with(~str_replace_all(., " ", "_") |> tolower(), everything())

Check for obvious errors or inefficiencies. For example, are there records whose Closed Date is earlier than or exactly the same as the Created Date? Are their invalid values for any columns? Are any columns redundant?

scan_data(data, sections = "OVM") |>
  export_report("tbl_scan.html")

✔ The table scan has been written as `tbl_scan.html`

data |>
  create_agent() |>
  col_vals_not_null("unique_key") |>
  rows_distinct("unique_key") |>
  col_vals_lt(created_date, vars(closed_date), na_pass = TRUE) |>
  col_vals_lte(created_date, vars(closed_date), na_pass = TRUE) |>
  col_vals_between(latitude, -90, 90, na_pass = TRUE) |>
  col_vals_between(longitude, -180, 180, na_pass = TRUE) |>
  interrogate()

		STEP	COLUMNS	VALUES	EVAL	UNITS	PASS	FAIL	W	S	N	EXT
Pointblank Validation
[2023-11-02\|15:53:16] tibble data
	1	`col_vals_not_null()`	`▮unique_key`	—	✓	`54K`	`54K` `1.00`	`0` `0.00`	—	—	—	—
	2	`rows_distinct()`	`▮unique_key`	—	✓	`54K`	`54K` `1.00`	`0` `0.00`	—	—	—	—
	3	`col_vals_lt()`	`▮created_date`	`▮closed_date`	✓	`54K`	`53K` `0.97`	`2K` `0.03`	—	—	—
	4	`col_vals_lte()`	`▮created_date`	`▮closed_date`	✓	`54K`	`54K` `0.99`	`134` `0.01`	—	—	—
	5	`col_vals_between()`	`▮latitude`	`[−90, 90]`	✓	`54K`	`54K` `1.00`	`0` `0.00`	—	—	—	—
	6	`col_vals_between()`	`▮longitude`	`[−180, 180]`	✓	`54K`	`54K` `1.00`	`0` `0.00`	—	—	—	—
2023-11-02 15:53:16 EDT < 1 s 2023-11-02 15:53:17 EDT

Fill in missing values if possible. For example, if incident zip code is missing but the location is not, the zip code could be recovered by geocoding.

get_zipcode <- function(lat, lon) {
  url <- glue::glue("photon.komoot.io/reverse?lon={lon}&lat={lat}")
  o <- httr::GET(url) |> httr::content()
  zipcode <- o$features[[1]]$properties$postcode |> as.numeric()
  if (length(zipcode) == 0) {
    return(NA)
  }
  return(zipcode)
}

safe_zip <- possibly(get_zipcode, otherwise = NA)
fixed_zips <- data |>
  filter(is.na(incident_zip) & !is.na(longitude) & !is.na(latitude)) |>
  mutate(zip_fixed = map2_dbl(latitude, longitude, safe_zip)) |>
  select(unique_key, zip_fixed)

save(fixed_zips, file = "data/fixed_zips.rds")

load("data/fixed_zips.rds")
data <- data |>
  left_join(fixed_zips, by = "unique_key") |>
  mutate(incident_zip = case_when(
    is.na(incident_zip) ~ zip_fixed,
    !is.na(incident_zip) ~ incident_zip
  ))

Summarize your suggestions to the data curator in several bullet points.

Several of the columns have >90% missingness:
- facility_type
- due_date
- vehicle_type
- taxi_company_borough
- taxi_pick_up_location
- bridge_highway_name
- bridge_highway_direction
- road_ramp
- bridge_highway_segment
A small number of rows (134) have closed dates prior to created dates. These are all “Street Light Condition” complaints and will be removed.

data <- data |>
  filter(!(created_date > closed_date))

save(data, file = "data/data.rds")