munging.Rmd

---
title: "Munging"
output: html_document
---

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

```{r}

  require(data.table)
  #require(ggplot2)
  #require(doBy)
  require(stringr)

```


```{r}

  # Root folder of the project; "." is resolved against the current working
  # directory.
  project.dir <- "."
  # Folder holding the raw input files, built relative to project.dir.
  dataset.dir <- file.path(project.dir,"data_raw")
  
  # when done working, you can save your workspace as:
  # (dataset.dir already starts with project.dir, so it is not repeated)
  # save.image(file = file.path(dataset.dir, "image.RData"))
  
  # then, when you start working again, you can start back up with:
  # load(file.path(dataset.dir, "image.RData"))

```

#### How to read

```{r}

  # Load one yearly extract and tag every row with the year it covers.
  #
  # path:     location of the delimited file, handed to fread() unchanged.
  # year_num: value stored in the new `year` column (recycled to all rows).
  #
  # Returns the table produced by fread() with the extra `year` column.
  read.file <- function(path, year_num) {
    yearly <- fread(path)
    yearly$year <- year_num
    yearly
  }
  
```


#### Read the file 
```{r, echo=FALSE}
  
  # first, download the original file:
  #top100.dt <- fread("https://data.cms.gov/api/views/97k6-zzx3/rows.csv?accessType=DOWNLOAD")

  # Read the three fiscal-year extracts and stack them into a single table.
  # The per-year tables only live inside local(), so nothing has to be removed
  # from the workspace afterwards.
  top100.dt <- local({
    fy2011 <- read.file(file.path(dataset.dir, "Medicare_Provider_Charge_Inpatient_DRG100_FY2011.csv"), 2011)
    fy2012 <- read.file(file.path(dataset.dir, "Medicare_Provider_Charge_Inpatient_DRG100_FY2012.csv"), 2012)
    fy2013 <- read.file(file.path(dataset.dir, "Medicare_Provider_Charge_Inpatient_DRG100_FY2013.csv"), 2013)
    rbind(fy2011, fy2012, fy2013)
  })

  # Strip the spaces out of every column name
  # (e.g. "DRG Definition" becomes "DRGDefinition").
  setnames(
    top100.dt,
    old = names(top100.dt),
    new = gsub(" ", "", names(top100.dt))
  )
```

#### Clean up

##### DRG Description column

Right now, there is a column called DRGDefinition where entries are of the form: "code - description"

Here is an example: "039 - EXTRACRANIAL PROCEDURES W/O CC/MCC".

We break this into two (factor) columns: drg.code and drg.description.

```{r}
  
  # Drop the empty rows (all columns are "" or NA). Rows whose DRGDefinition is
  # NA are dropped too, because the comparison is not TRUE for them.
  top100.dt <- top100.dt[DRGDefinition != ""]

  # Split "code - description" on spaces: word 1 is the code, word 2 is the
  # dash, and words 3 to the end form the description. Both become factors.
  top100.dt[, c("drg.code", "drg.description") := list(
    factor(word(DRGDefinition, 1)),
    factor(word(DRGDefinition, 3, -1))
  )]

  # The combined column is now redundant.
  top100.dt[, DRGDefinition := NULL]
  
```

Change zip codes to factors. Add 0s at the beginning of zip codes where they were lost when stored as integers. (Yes, I tried reading them as factors in fread, the 0s were already gone.)

```{r}

  # Zip codes were read as integers, so their leading zeros are gone.
  # Left-pad every code back to five characters before storing it as a factor
  # (601 -> "00601", 2115 -> "02115", 72015 stays "72015"). Prepending a single
  # "0" is not enough: codes below 1000 need two or more zeros, otherwise they
  # cannot match a five-character zip key elsewhere. Missing values stay NA.
  top100.dt[, ProviderZipCode := factor(
                      str_pad(
                        as.character(ProviderZipCode),
                        width = 5, side = "left", pad = "0"
                      )
                    )]
  
```

Turn "money" columns into numbers without "$" signs (this step is currently disabled; the code in the chunk below is commented out):

```{r}

#  names_numeric = c("AverageMedicarePayments", "AverageTotalPayments", "AverageCoveredCharges")
#  for(col in names_numeric) set(top100.dt, j=col, value=as.numeric(gsub("$", "", top100.dt[[col]], fixed = TRUE)))
  
```

Turn character columns into factors where appropriate:

```{r}
  
  # Columns that hold identifiers or labels; each one is stored as a factor.
  names_factors <- c(
    "ProviderId",
    "ProviderName",
    "ProviderStreetAddress",
    "ProviderCity",
    "ProviderState",
    "HospitalReferralRegion(HRR)Description"
  )

  # Convert all listed columns in a single in-place assignment.
  top100.dt[, (names_factors) := lapply(.SD, as.factor), .SDcols = names_factors]
  
```


Change column names:

```{r}

  # Replace every column name, by position, with a short dotted name.
  # The vector must supply exactly one name per existing column, in the
  # table's current column order.
  setnames(
    top100.dt,
    c(
      "provider.id", "provider.name", "address", "city", "state", "zip",
      "region", "num.discharges", "covered.charges", "total.payments",
      "medicare.payments", "year", "drg.code", "drg.description"
    )
  )
  
```

This delimited file, taken from <https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MedicareFeeforSvcPartsAB/downloads/DRGdesc08.pdf>, details the MS-DRG (diagnosis related groups) codes.

```{r}
  # Load the "~"-delimited MS-DRG code list, reading every column as text.
  msdrg.dt <- fread(
    file.path(dataset.dir, "ms-drg-codes.tdsv"),
    sep = "~",
    colClasses = "character"
  )

  # Derive a complication label from the title text:
  #   "NONE" when the title contains "w/o CC" or "w/o MCC",
  #   "MCC"  when it otherwise mentions MCC,
  #   "CC"   for everything else.
  msdrg.dt[, comp := {
    without.cc <- grepl("w/o M?CC", Title)
    with.mcc <- grepl("MCC", Title)
    ifelse(without.cc, "NONE", ifelse(with.mcc, "MCC", "CC"))
  }]
  msdrg.dt[, Title := NULL]

  # Name the four remaining columns by position, then make the join key a
  # factor like the drg.code column of top100.dt.
  setnames(msdrg.dt, c("drg.code", "drg.mdc", "drg.type", "drg.comp"))
  msdrg.dt[, drg.code := factor(drg.code)]

  # Inner join: rows of top100.dt without a matching drg.code are dropped.
  top100.dt <- merge(top100.dt, msdrg.dt, by = "drg.code")
```

Census Data:
```{r}
  
  # The awk filter keeps line 1 and every line after line 2, i.e. it drops the
  # second line of the file (presumably a second, descriptive header row --
  # confirm against the raw export).
  census.fname <- file.path(
    dataset.dir, "demographic_ZTAC_2010", "DEC_10_DP_DPDP1_with_ann.csv"
  )
  census.cmd <- paste("awk 'NR == 1 || NR > 2'", census.fname)

  # Pass 1: read only the first 100 rows to learn the column types.
  census.dt <- fread(census.cmd, na.strings = c("(X)", "( X )"), nrows = 100)
  classes <- sapply(census.dt, class)
  # The 2nd column is GEO.id2, the zip code: force it to text.
  classes[2] <- "character"

  # Pass 2: read the whole file with the column types fixed up front.
  census.dt <- fread(
    census.cmd,
    na.strings = c("(X)", "( X )"),
    colClasses = classes
  )
  census.dt[, GEO.id2 := as.factor(GEO.id2)]

  # Keep the zip code and the 65-and-over figures under readable names:
  #   HD01_S025,Number; SEX AND AGE - Total population - 65 years and over
  #   HD01_S050,Number; SEX AND AGE - Male population - 65 years and over
  #   HD01_S075,Number; SEX AND AGE - Female population - 65 years and over
  setnames(
    census.dt,
    old = c(
      "GEO.id2",
      "HD01_S025", "HD01_S050", "HD01_S075",
      "HD02_S025", "HD02_S050", "HD02_S075"
    ),
    new = c(
      "zip", "over.65.all", "over.65.male", "over.65.female",
      "over.65.all.pct", "over.65.male.pct", "over.65.female.pct"
    )
  )

  # Every column still named HD* or GEO* was not renamed above; drop them all.
  census.dt[, grep("^(HD|GEO)", names(census.dt)) := NULL]

  # Left join: keep every row of top100.dt, with NA where the zip is unknown.
  top100.dt <- merge(top100.dt, census.dt, by = "zip", all.x = TRUE)
  
```


Add lat/lon columns for each address in the dataset.
```{r}

  # to add lat/lon columns
  #
  # NOTE(review): geocode() is not defined in this file; presumably it is
  # ggmap::geocode() -- confirm that the package is loaded before knitting.

  # Full address string, used both as the geocoder query and as the row key.
  top100.dt[, temp.address := paste(address, city, state, zip)]

  # Geocode one address and store the result on every row that carries it.
  #
  # data:   data.table with a temp.address column; lon/lat are assigned with
  #         `:=`, so the caller's table is updated by reference.
  # add:    address string to look up.
  # source: backend name passed on to geocode(); defaults to "google".
  #
  # Returns `data` invisibly so that calling it never prints the table.
  assign_lat_lon <- function(data, add, source = "google"){
    latlon <- geocode(add, source = source)
    # one scan of the table per address; both columns are set together
    hits <- which(data$temp.address == add)
    data[hits, c("lon", "lat") := list(latlon$lon, latlon$lat)]
    invisible(data)
  }

  # First pass over every distinct address with the "dsk" backend.
  # A for loop is used because only the side effect matters; a top-level
  # lapply() would return -- and print -- one copy of the table per address.
  # takes forever:
  for (addr in unique(top100.dt$temp.address)) {
    assign_lat_lon(top100.dt, addr, source = "dsk")
  }

  # there were warnings that not all addresses were found. For the remainders, we turn to Google.
  for (addr in unique(top100.dt[which(is.na(top100.dt$lon)),]$temp.address)) {
    assign_lat_lon(top100.dt, addr, source = "google")
  }

  # did we get everything? no. for the last two, I give a little help...
  # google didn't like the '#' signs in the addresses. I realized this after I completed the task...
  # Names are the addresses as stored in the table; values are the queries
  # sent to the geocoder instead.
  manual.fixes <- c(
    "#1 MEDICAL PARK DRIVE BENTON AR 72015" = "1 MEDICAL PARK DRIVE, BENTON AR, 72015",
    "#3 EAST BENJAMIN DRIVE NEW MARTINSVILL WV 26155" = "3 EAST BENJAMIN DRIVE NEW MARTINSVILL WV 26155"
  )
  for (stored in names(manual.fixes)) {
    latlon <- geocode(manual.fixes[[stored]], source = "google")
    top100.dt[
      which(top100.dt$temp.address == stored),
      c("lon", "lat") := list(latlon$lon, latlon$lat)
    ]
  }
  ### DONE!

  # get rid of temp column
  top100.dt[, temp.address := NULL]

```

Write a .csv:

```{r}
  # Write the final table as a gzip-compressed CSV without row names.
  # The gzfile connection is handed over unopened: write.csv() opens it for
  # writing and closes it again once the data has been written.
  write.csv(
    top100.dt,
    file = gzfile("./data/Top100Procedures.csv.gz"),
    row.names = FALSE
  )
```

To read the csv:

```{r}

  # Read the compressed CSV back in. The input string is a shell command, so
  # fread parses whatever `gunzip -c` writes to standard output.
  top100.dt <- fread("gunzip -c ./data/Top100Procedures.csv.gz")
  # NOTE(review): zip was a factor when the file was written; presumably it
  # has to be converted again after reading -- confirm before enabling:
  #top100.dt[,zip := as.factor(zip)]
  #this should be 474412 rows by 23 columns
  #dim(top100.dt)
  
```