.Rhistory

# Explore how the condensed counts of construction years look at different
# bin widths (1, 2 and 5 years).
# The first call in the original history misspelled condense() as "condence";
# only the corrected call is kept.
yr <- condense(bin(yrBuilt, 1))
autoplot(yr)
yr <- condense(bin(yrBuilt, 2))
autoplot(yr)
yr <- condense(bin(yrBuilt, 5))
autoplot(yr)
# Go back to 2-year bins and print the condensed table.
yr <- condense(bin(yrBuilt, 2))
yr
# Floor counts and construction years for the same set of lots: built after
# 1850, total assessment between 10,000 and 10,000,000, non-zero floor count.
# Both vectors use the identical mask so they stay aligned element by element.
numFloor <- with(
  pdata,
  NumFloors[YearBuilt > 1850 & AssessTot > 10000 & AssessTot < 10000000 & NumFloors != 0]
)
yrBuilt <- with(
  pdata,
  YearBuilt[YearBuilt > 1850 & AssessTot > 10000 & AssessTot < 10000000 & NumFloors != 0]
)
# 2-d condense: 3-year bins on the x axis, 1-floor bins on the y axis.
flrVsYr <- condense(bin(yrBuilt, 3), bin(numFloor, 1))
flrVsYr
autoplot(flrVsYr)
autoplot(flrVsYr) +
  theme(panel.background = element_rect(fill = 'white')) +
  ylim(0, 50)
p <- autoplot(flrVsYr) +
  theme(panel.background = element_rect(fill = 'white')) +
  ylim(0, 50)
# Log-scaled fill from grey to blue; myBreaks must already exist in the session.
p + scale_fill_gradient(
  limits = c(1, 100000),
  low = 'grey',
  high = 'blue',
  trans = "log",
  breaks = myBreaks
)
# Same plot with 1-year bins.
flrVsYr <- condense(bin(yrBuilt, 1), bin(numFloor, 1))
p <- autoplot(flrVsYr) +
  theme(panel.background = element_rect(fill = 'white')) +
  ylim(0, 50)
p + scale_fill_gradient(
  limits = c(1, 100000),
  low = 'grey',
  high = 'blue',
  trans = "log",
  breaks = myBreaks
)
# Same styling applied to assessVsYr (created outside this part of the history).
p <- autoplot(assessVsYr) +
  theme(panel.background = element_rect(fill = 'white')) +
  xlim(1850, 2015)
p + scale_fill_gradient(
  limits = c(1, 100000),
  low = 'grey',
  high = 'blue',
  trans = "log",
  breaks = myBreaks
)
# Total assessment for the same rows used for numFloor / yrBuilt (identical
# filter), so the three vectors line up element by element.
# The original history divided by numFloor one line *before* assessTot was
# assigned; assessTot is now defined first.
assessTot <- pdata$AssessTot[pdata$YearBuilt > 1850 & pdata$AssessTot > 10000 & pdata$AssessTot < 10000000 & pdata$NumFloors != 0]
valPerFloor <- assessTot / numFloor
# First try: 1-unit bins on value per floor.
ValPerYr <- condense(bin(yrBuilt, 1), bin(valPerFloor, 1))
tail(ValPerYr)
# Second try: 50,000-wide bins on value per floor.
ValPerYr <- condense(bin(yrBuilt, 1), bin(valPerFloor, 50000))
tail(ValPerYr)
p <- autoplot(ValPerYr) + theme(panel.background = element_rect(fill = 'white')) + ylim(0, 50)
p + scale_fill_gradient(limits = c(1, 100000),
                        low = 'grey',
                        high = 'blue',
                        trans = "log",
                        breaks = myBreaks)
# Without the y limit.
p <- autoplot(ValPerYr) + theme(panel.background = element_rect(fill = 'white'))
p + scale_fill_gradient(limits = c(1, 100000),
                        low = 'grey',
                        high = 'blue',
                        trans = "log",
                        breaks = myBreaks)
# Zoom on 1945-2014.
p <- autoplot(ValPerYr) + theme(panel.background = element_rect(fill = 'white')) + xlim(1945, 2014)
p + scale_fill_gradient(limits = c(1, 100000),
                        low = 'grey',
                        high = 'blue',
                        trans = "log",
                        breaks = myBreaks)
# Zoom on 1930-1960.
p <- autoplot(ValPerYr) + theme(panel.background = element_rect(fill = 'white')) + xlim(1930, 1960)
p + scale_fill_gradient(limits = c(1, 100000),
                        low = 'grey',
                        high = 'blue',
                        trans = "log",
                        breaks = myBreaks)
# Open assessVsYr in the interactive editor (fix() writes any edits back).
fix(assessVsYr)
# Run the lecture 2 solution script from the Dropbox folder.
source('~/Dropbox/CUNY/my_solutions/lect2.r')
# Lot area and construction year for lots built after 1850 with lot area
# above 100, total assessment below 10,000,000 and a non-zero floor count.
# The same mask is used for both vectors so they stay aligned.
lotArea <- pdata$LotArea[pdata$YearBuilt > 1850 & pdata$LotArea > 100 & pdata$AssessTot < 10000000 & pdata$NumFloors != 0]
yrBuilt <- pdata$YearBuilt[pdata$YearBuilt > 1850 & pdata$LotArea > 100 & pdata$AssessTot < 10000000 & pdata$NumFloors != 0]
# 1-year bins on x, 1000-wide lot-area bins on y.
areaVsYr <- condense(bin(yrBuilt, 1), bin(lotArea, 1000))
tail(areaVsYr)
# Legend breaks for the log-scaled fill.
myBreaks <- c(100000, 10000, 1000, 100, 10, 1)
# A theme element has to be wrapped in theme(); the original added a bare
# element_rect() to the plot, which ggplot2 cannot add to a plot object.
p <- autoplot(areaVsYr) + theme(panel.background = element_rect(fill = NULL))
p + scale_fill_gradient(limits = c(1, 1000000),
                        low = 'grey',
                        high = 'blue',
                        trans = "log",
                        breaks = myBreaks)
# White panel background.
p <- autoplot(areaVsYr) + theme(panel.background = element_rect(fill = 'white'))
p + scale_fill_gradient(limits = c(1, 1000000),
                        low = 'grey',
                        high = 'blue',
                        trans = "log",
                        breaks = myBreaks)
# Limit the y axis to 0-100,000.
p <- autoplot(areaVsYr) + theme(panel.background = element_rect(fill = 'white')) + ylim(0, 100000)
p + scale_fill_gradient(limits = c(1, 1000000),
                        low = 'grey',
                        high = 'blue',
                        trans = "log",
                        breaks = myBreaks)
# Limit the y axis to 0-200,000.
p <- autoplot(areaVsYr) + theme(panel.background = element_rect(fill = 'white')) + ylim(0, 200000)
p + scale_fill_gradient(limits = c(1, 1000000),
                        low = 'grey',
                        high = 'blue',
                        trans = "log",
                        breaks = myBreaks)
# Condense lot area (z) per construction-year bin and plot the summary,
# trying several axis limits and bin widths (1, 3 and 5 years).
yr <- condense(bin(yrBuilt, 1), z = lotArea)
autoplot(yr)
autoplot(yr) + xlim(1900, 2014)
autoplot(yr) + xlim(1900, 2014) + ylim(0, 10000)
autoplot(yr) + xlim(1900, 2014) + ylim(0, 50000)
yr <- condense(bin(yrBuilt, 3), z = lotArea)
autoplot(yr) + xlim(1900, 2014) + ylim(0, 50000)
yr <- condense(bin(yrBuilt, 5), z = lotArea)
autoplot(yr) + xlim(1900, 2014) + ylim(0, 50000)
autoplot(yr) + xlim(1900, 2014) + ylim(0, 10000)
# Label the y axis. The original history first typed "= ylab(...)" instead of
# "+ ylab(...)", which is an invalid assignment; only the working call is kept.
autoplot(yr) + xlim(1900, 2014) + ylim(0, 10000) + ylab('Lot Area')
# Lot-area heat map on a white panel, y axis capped at 200,000.
p <- autoplot(areaVsYr) +
  theme(panel.background = element_rect(fill = 'white')) +
  ylim(0, 200000)
# Fill scale with an upper limit of 1,000,000 ...
p + scale_fill_gradient(
  limits = c(1, 1000000),
  low = 'grey',
  high = 'blue',
  trans = "log",
  breaks = myBreaks
)
# ... and with an upper limit of 100,000.
p + scale_fill_gradient(
  limits = c(1, 100000),
  low = 'grey',
  high = 'blue',
  trans = "log",
  breaks = myBreaks
)
# Re-condense with 2-year bins and redraw.
areaVsYr <- condense(bin(yrBuilt, 2), bin(lotArea, 1000))
myBreaks <- c(1e5, 1e4, 1e3, 1e2, 1e1, 1e0)
p <- autoplot(areaVsYr) +
  theme(panel.background = element_rect(fill = 'white')) +
  ylim(0, 200000)
p + scale_fill_gradient(
  limits = c(1, 100000),
  low = 'grey',
  high = 'blue',
  trans = "log",
  breaks = myBreaks
)
library("bigvis")
library("ggplot2")
# Load the PLUTO extract once, from the .csv file. The original history first
# tried two paths ending in ".R" and also indexed a lower-case `pdata` object,
# while the data frame created here is `pData`.
pData <- read.csv("/Users/JL/Dropbox/CUNY/datasets/all_PLUTO_data.csv")
# Same mask for both vectors so they stay aligned: built after 1850, lot area
# above 100, total assessment below 10,000,000, non-zero floor count.
lotArea <- pData$LotArea[pData$YearBuilt > 1850 & pData$LotArea > 100 & pData$AssessTot < 10000000 & pData$NumFloors != 0]
yrBuilt <- pData$YearBuilt[pData$YearBuilt > 1850 & pData$LotArea > 100 & pData$AssessTot < 10000000 & pData$NumFloors != 0]
# Lot-area summary per construction-year bin, at 5-, 10- and 1-year widths.
yr <- condense(bin(yrBuilt, 5), z = lotArea)
autoplot(yr) + xlim(1900, 2014) + ylim(0, 10000) + ylab('Lot Area')
yr <- condense(bin(yrBuilt, 10), z = lotArea)
autoplot(yr) + xlim(1900, 2014) + ylim(0, 10000) + ylab('Lot Area')
yr <- condense(bin(yrBuilt, 1), z = lotArea)
autoplot(yr) + xlim(1900, 2014) + ylim(0, 10000) + ylab('Lot Area')
# Remove two objects from the workspace.
rm(all.purch)
rm(blu)
# Quit the R session.
q()
~/.RData
library("bigvis")
library("ggplot2")
All_dataBKBXMNQNSI <- read.csv('/Users/JL/Dropbox/CUNY/datasets/all_PLUTO_data.csv')
# Total assessment per floor (rows with NumFloors == 0 are not excluded here).
All_dataBKBXMNQNSI$AssValperfl <- All_dataBKBXMNQNSI$AssessTot / All_dataBKBXMNQNSI$NumFloors
# Row indices of lots built after 1850 with lot area above 100, total
# assessment below 10,000,000 and a non-zero floor count.
# This has to be assigned before the plots below index with it; the original
# history ran the scatter plot first, before this assignment.
# NOTE: the variable shares its name with base::subset(); calls such as
# subset(df, ...) still find the function, but a different name would be clearer.
subset <- which(All_dataBKBXMNQNSI$YearBuilt > 1850 &
                  All_dataBKBXMNQNSI$LotArea > 100 &
                  All_dataBKBXMNQNSI$AssessTot < 10000000 &
                  All_dataBKBXMNQNSI$NumFloors != 0)
# Histogram of construction year in 5-year bins.
ggplot(All_dataBKBXMNQNSI[subset, ], aes(x = YearBuilt)) + geom_histogram(binwidth = 5)
# Scatter of floor count against construction year (open circles).
ggplot(All_dataBKBXMNQNSI[subset, c("NumFloors", "YearBuilt")], aes(y = (NumFloors), x = YearBuilt)) + geom_point(shape = 1)
pData <- read.csv('/Users/JL/Dropbox/CUNY/datasets/all_PLUTO_data.csv')
?smooth
library(ggplot2)
library(bigvis)
library(plyr)
?smooth
# Keep lots built in 1850 or later.
pData <- pData[pData[["YearBuilt"]] >= 1850, ]
# Convert the categorical columns to factors in one step.
pData[c("Borough", "LtdHeight", "LandUse", "BuiltCode")] <-
  lapply(pData[c("Borough", "LtdHeight", "LandUse", "BuiltCode")], as.factor)
# Drop rows where both the building count and the floor count are zero
# (i.e. keep a row when either one is non-zero).
pData <- pData[pData[["NumBldgs"]] != 0 | pData[["NumFloors"]] != 0, ]
# Drop lots with a very small area.
pData <- pData[pData[["LotArea"]] > 100, ]
# The data carries two assessments: total and land. Because the interest is
# in building construction, use the building's share only, i.e. total
# assessment minus land assessment, stored as AssessBldg.
pData[["AssessBldg"]] <- pData[["AssessTot"]] - pData[["AssessLand"]]
pData <- pData[pData[["AssessBldg"]] > 0 & pData[["BldgArea"]] > 0, ]
pData[["AssessSqft"]] <- pData[["AssessBldg"]] / pData[["BldgArea"]]
# Cost per floor needs a non-zero floor count, so filter those rows first ...
pData <- pData[pData[["NumFloors"]] != 0, ]
# ... then add the per-floor value column.
pData[["ValuePerFloor"]] <- pData[["AssessBldg"]] / pData[["NumFloors"]]
# Figure 1: number of buildings per decade of construction, filled by
# BuiltCode ("" is labelled "Known", "E" is labelled "Estimated").
ggplot(pData, aes(YearBuilt, fill = BuiltCode)) +
  # geom_histogram() is the documented form of a binned bar chart.
  geom_histogram(binwidth = 10) +
  scale_x_continuous(breaks = seq(1850, 2025, 10)) +
  # White separators at each decade; geom_vline() takes `xintercept`, not `x`.
  geom_vline(xintercept = seq(1850, 2025, 10), color = "white") +
  theme(panel.grid.minor.x = element_blank()) +
  ggtitle("Construction activity per decade") +
  xlab("Year") + ylab("Number of buildings") +
  scale_fill_discrete(name = "Record Type",
                      breaks = c("", "E"),
                      labels = c("Known", "Estimated"))
# Figure 2: 2-d condensed counts of buildings by decade built (x) and number
# of floors in 2-floor bins (y), drawn with a log-scaled grey fill.
floors <- condense(bin(pData$YearBuilt, 10), bin(pData$NumFloors, 2))
autoplot(floors) + theme_bw() +
  scale_fill_gradient("Number Built",
                      trans = 'log',
                      # limits = c(1, 1000),
                      low = "lightgray", high = "black") +
  scale_y_continuous(breaks = seq(0, 130, 10)) +
  scale_x_continuous(breaks = seq(1850, 2020, 10)) +
  xlab("Year Built") +
  ylab("Number of floors") +
  ggtitle("Height distribution of buildings.") +
  # Reference lines at 20, 30 and 40 floors; geom_hline() takes `yintercept`,
  # not `y`.
  geom_hline(yintercept = c(20, 30, 40), color = "blue")
# Figure 3
bin.year <- 1
sf <- bin.year * 10 # smoothing factor
sf
?smooth
?paste0
# The summary and smoothing settings must exist before condense() uses them;
# the original history printed `stype` and called condense() before assigning it.
stype <- "median" # condense summary. set to [mean, median]
smethod <- "robust_regression" # set to [mean, regression, robust_regression]
# Summarise ValuePerFloor per construction-year bin.
value <- condense(bin(pData$YearBuilt, bin.year),
                  z = pData$ValuePerFloor,
                  summary = stype)
# Smooth the summary column, which is named "." followed by the summary type.
value.s <- smooth(value, sf, var = paste0(".", stype), type = smethod)
?smooth
library("bigvis")
library("ggplot2")
setwd('/Users/JL/Box Sync/CUNY/CUNY_IS608/lecture2')

# Pass 1: first 1,000 rows of the sample file.
pData <- read.csv("data/some_PLUTO_data.csv")
pData <- head(pData, 1000)
lotArea <- with(
  pData,
  LotArea[YearBuilt > 1850 & LotArea > 100 & AssessTot < 10000000 & NumFloors != 0]
)
yrBuilt <- with(
  pData,
  YearBuilt[YearBuilt > 1850 & LotArea > 100 & AssessTot < 10000000 & NumFloors != 0]
)
# Lot-area summary in 5-year bins.
yr <- condense(bin(yrBuilt, 5), z = lotArea)
autoplot(yr) +
  xlim(1900, 2014) +
  ylim(0, 10000) +
  ylab('Lot Area')

# Pass 2: first 10,000 rows.
pData <- read.csv("data/some_PLUTO_data.csv")
pData <- head(pData, 10000)
# Does lot area change with year of construction?
lotArea <- with(
  pData,
  LotArea[YearBuilt > 1850 & LotArea > 100 & AssessTot < 10000000 & NumFloors != 0]
)
yrBuilt <- with(
  pData,
  YearBuilt[YearBuilt > 1850 & LotArea > 100 & AssessTot < 10000000 & NumFloors != 0]
)
# Lot-area summary in 5-year bins.
yr <- condense(bin(yrBuilt, 5), z = lotArea)
autoplot(yr) +
  xlim(1900, 2014) +
  ylim(0, 10000) +
  ylab('Lot Area')

# Pass 3: the whole sample file.
pData <- read.csv("data/some_PLUTO_data.csv")
# Does lot area change with year of construction?
lotArea <- with(
  pData,
  LotArea[YearBuilt > 1850 & LotArea > 100 & AssessTot < 10000000 & NumFloors != 0]
)
yrBuilt <- with(
  pData,
  YearBuilt[YearBuilt > 1850 & LotArea > 100 & AssessTot < 10000000 & NumFloors != 0]
)
# Lot-area summary in 5-year bins.
yr <- condense(bin(yrBuilt, 5), z = lotArea)
autoplot(yr) +
  xlim(1900, 2014) +
  ylim(0, 10000) +
  ylab('Lot Area')
library(ggplot2)
library(plyr)
setwd("/Users/JL/Box Sync/CUNY/CUNY_IS608")
# Read the Inc. 5000 file.
inc <- read.csv("lecture1/data/inc5000_data.csv", header = TRUE)
# Quick look at data quality.
head(inc)
summary(inc)
summary(inc[, c(3:6, 8)])
# The summaries show NAs; keep only rows without any.
all_inc <- inc[complete.cases(inc), ]
# Number of rows per state.
cnt <- ddply(all_inc, .(State), summarize, cnt = length(State))
p <- ggplot(cnt, aes(x = State, y = cnt)) +
  geom_bar(stat = 'identity')
p
# Same bars with the axes flipped.
p <- ggplot(cnt, aes(x = State, y = cnt)) +
  geom_bar(stat = 'identity')
p + coord_flip()
# Look at the state with the 3rd most companies on the list,
# broken down by Industry.
# head(ny)
ny <- subset(all_inc, State == 'NY')
p <- ggplot(ny, aes(x = Industry, y = Employees)) +
  geom_point()
p + coord_flip()
# uh oh, we have an outlier that makes this analysis difficult
# Clamp values into a fixed range so a single extreme point cannot stretch
# the axis.
#
# x   : numeric vector (or single number) to clamp.
# bot : lower bound; values below it are replaced by `bot`.
# top : upper bound; values above it are replaced by `top`.
# Returns a vector the same length as `x`; NA entries stay NA.
#
# pmin()/pmax() work element-wise, so the function can be called on a whole
# column at once. The previous min()/max() form collapsed a vector to one number.
winsor <- function(x, bot, top) {
  pmin(top, pmax(x, bot))
}
# Clamp each employee count to the range 0..2500 and plot the clamped values.
ny[["clip_employ"]] <- sapply(
  ny[["Employees"]],
  function(n) winsor(n, bot = 0, top = 2500)
)
p3 <- ggplot(ny, aes(x = Industry, y = clip_employ))
p3 +
  geom_point() +
  coord_flip()
# Re-run the data-quality checks and rebuild the NA-free data frame.
head(inc)
summary(inc)
summary(inc[, c(3:6, 8)])
all_inc <- inc[complete.cases(inc), ]
library("bigvis")
library("ggplot2")
setwd('/Users/JL/Box Sync/CUNY/CUNY_IS608/lecture2')
pData <- read.csv("data/some_PLUTO_data.csv")
head(pData)
# Aligned lot-area / construction-year vectors (same mask for both).
lotArea <- with(
  pData,
  LotArea[YearBuilt > 1850 & LotArea > 100 & AssessTot < 10000000 & NumFloors != 0]
)
yrBuilt <- with(
  pData,
  YearBuilt[YearBuilt > 1850 & LotArea > 100 & AssessTot < 10000000 & NumFloors != 0]
)
# Lot-area summary at 5-, 10- and 3-year bin widths.
yr <- condense(bin(yrBuilt, 5), z = lotArea)
autoplot(yr) +
  xlim(1900, 2014) +
  ylim(0, 10000) +
  ylab('Lot Area')
yr <- condense(bin(yrBuilt, 10), z = lotArea)
autoplot(yr) +
  xlim(1900, 2014) +
  ylim(0, 10000) +
  ylab('Lot Area')
yr <- condense(bin(yrBuilt, 3), z = lotArea)
autoplot(yr) +
  xlim(1900, 2014) +
  ylim(0, 10000) +
  ylab('Lot Area')
# Work on a copy of the Inc. 5000 data. (The original history first typed
# `ince`, a misspelling of `inc`, and then repeated the commands.)
df <- inc
library(ggplot2)
# Reverse the state order so the flipped chart reads top-down.
# factor() is applied first so this also works when State was read in as
# character: levels() of a character vector is NULL.
df$State <- factor(df$State, levels = rev(levels(factor(df$State))))
# Count of rows per state. geom_bar() is the counting geom for a discrete x;
# geom_histogram() bins a continuous variable.
ggplot(df, aes(factor(State))) + geom_bar() + xlab("State") + ylab("Total Companies Residing") + ggtitle("5,000 Fastest Growing Companies in the US") + coord_flip()
# Third entry of the state counts sorted in decreasing order.
rev(sort(table(df$State)))[3]
# NY rows only, taken from the rows of df that have no missing values.
ny <- subset(df[complete.cases(df), ], State == 'NY')
library(plyr)
# Mean and standard deviation of Employees for each Industry ...
ind.mean.sd <- ddply(
  ny, ~Industry, summarise,
  mean = mean(Employees),
  sd = sd(Employees)
)
# ... attached to every NY row via the shared Industry column.
ny.merged <- merge(ind.mean.sd, ny, by = "Industry")
# Replace values that lie more than one standard deviation away from the
# mean with NA; everything else is returned unchanged.
#
# num  : number or numeric vector to screen.
# mean : centre to compare against (single value or one per element of num).
# sd   : spread to compare against (single value or one per element of num).
# Returns `num` with the out-of-range entries set to NA.
#
# Entries whose comparison cannot be decided (missing sd, mean or num) are
# kept as they are: which() ignores NA results. This keeps the original
# "missing sd means keep the value" rule, and a missing num or mean no longer
# stops with an error. The function is vectorised, so it can be applied to
# whole columns as well as one value at a time.
cust.remove <- function(num, mean, sd) {
  too_far <- which(num > mean + sd | num < mean - sd)
  num[too_far] <- NA
  num
}
# Screen every merged NY row against its industry's mean and sd.
# seq_len() is safe when the data frame has no rows (1:0 would yield c(1, 0)).
ny.merged$Employees <- sapply(seq_len(nrow(ny.merged)), function(x) cust.remove(ny.merged$Employees[x], ny.merged$mean[x], ny.merged$sd[x]))
# Keep the rows whose Employees value survived the screening.
ny.clean <- ny.merged[!is.na(ny.merged$Employees), ]
ggplot(ny.clean, aes(x = factor(Industry), y = Employees)) + geom_boxplot()
#This is useful still
ind.mean.sd[which(ind.mean.sd$Industry == 'Health'), ]$mean
# Alternative: drop only rows more than 10 standard deviations above the
# overall NY mean.
stand.dev <- sd(ny$Employees)
m <- mean(ny$Employees)
# The original indexed with `x$Employees`, but no object `x` is created in
# this history; the column lives in `ny`. setdiff() keeps every row when
# nothing exceeds the threshold, whereas ny[-which(...), ] with an empty
# which() would drop all rows.
ny <- ny[setdiff(seq_len(nrow(ny)), which(ny$Employees > m + (10 * stand.dev))), ]
ggplot(ny, aes(x = factor(Industry), y = Employees)) + geom_boxplot()
# Box plot drawn from precomputed statistics. `ny` has no mean/sd/min/max
# columns, so the per-industry statistics are computed first and plotted from
# that summary table.
ind.stats <- ddply(ny, ~Industry, summarise,
                   mean = mean(Employees), sd = sd(Employees),
                   min = min(Employees), max = max(Employees))
ggplot(ind.stats, aes(x = factor(Industry))) + geom_boxplot(aes(lower = mean - sd, upper = mean + sd, middle = mean, ymin = min, ymax = max), stat = "identity")
ny
# Replace box-plot outliers in a numeric vector with NA.
#
# num : numeric vector.
# Returns `num` with every value that boxplot.stats() reports in `$out`
# set to NA; all other entries are unchanged.
#
# Two defects are fixed here. `!which(...)` negated the index numbers, giving
# an all-FALSE mask, so nothing was replaced. The function also ended on the
# assignment, so it returned the assigned NA instead of the vector.
cust.remove <- function(num) {
  outs <- boxplot.stats(num)$out
  num[num %in% outs] <- NA
  num
}
# Apply the one-argument cust.remove() to each Employees value in turn.
# NOTE(review): each call passes a single value (ny$Employees[x]), so
# boxplot.stats() never sees the whole column - confirm this is intended.
ny$Employees <- sapply(1:nrow(ny), function(x) cust.remove(ny$Employees[x]))
# NOTE(review): the third argument here is the *result* of calling
# cust.remove(ny$Employees), not a function - verify what ddply should apply.
newny <- ddply(ny, ~Industry, cust.remove(ny$Employees))
# Box-plot statistics of Employees for every Industry in df.
aggregate(df$Employees, by=list(df$Industry), FUN=boxplot.stats)
# Outlier values and box plot for the "Health" industry only.
boxplot.stats(df$Employees[which(df$Industry == "Health")])$out
boxplot(df$Employees[which(df$Industry == "Health")])
# NOTE(review): this call passes three arguments to cust.remove(), while the
# most recent definition in this history takes only `num` - confirm which
# version was active when this ran.
ny.merged$Employees <- sapply(1:nrow(ny.merged), function(x) cust.remove(ny.merged$Employees[x], ny.merged$mean[x], ny.merged$sd[x]))
# Keep rows whose Employees value is not NA, then box-plot by Industry.
ny.clean<- ny.merged[!is.na(ny.merged$Employees),]
ggplot(ny.clean, aes(x=factor(Industry), y=Employees)) + geom_boxplot()
# Count of rows per state. geom_bar() is the counting geom for a discrete x;
# geom_histogram() bins a continuous variable.
ggplot(df, aes(factor(State))) + geom_bar() +
  xlab("State") + ylab("Total Companies Residing") +
  ggtitle("5,000 Fastest Growing Companies in the US") + coord_flip()
# Drop rows more than 10 standard deviations above the mean (m and stand.dev
# come from earlier in the session). The original indexed with
# `x$Employees`, but no object `x` is created in this history. setdiff()
# keeps every row when nothing exceeds the threshold, whereas
# ny[-which(...), ] with an empty which() would drop all rows.
ny <- ny[setdiff(seq_len(nrow(ny)), which(ny$Employees > m + (10 * stand.dev))), ]
ggplot(ny, aes(x = factor(Industry), y = Employees)) + geom_boxplot()
# Box plot from precomputed statistics. `ny` has no mean/sd/min/max columns,
# so the per-industry statistics are computed first and plotted from there.
ind.stats <- ddply(ny, ~Industry, summarise,
                   mean = mean(Employees), sd = sd(Employees),
                   min = min(Employees), max = max(Employees))
ggplot(ind.stats, aes(x = factor(Industry))) + geom_boxplot(aes(lower = mean - sd, upper = mean + sd, middle = mean, ymin = min, ymax = max), stat = "identity")
library(ggplot2)
library(plyr)
library(scales)
# Set the working directory before reading: the file path below is relative.
# (The original history issued one read.csv() before setwd().)
setwd("/Users/JL/Box Sync/CUNY/CUNY_IS608")
inc5000data <- read.csv("lecture1/data/inc5000_data.csv", header = TRUE)
# Rows without any missing value.
inc5000complete <- inc5000data[complete.cases(inc5000data), ]
inc5000complete
#### Question 1 ####
# One row per state with its frequency in `freq`.
# The last attempt in the original history, ddply(..., .(State), count),
# counted distinct *rows* within each state rather than the states themselves.
q1_data <- plyr::count(inc5000complete, "State")
ggplot(data = q1_data, aes(x = reorder(State, freq), y = freq)) + geom_bar(stat = 'identity') + coord_flip() + xlab("State") + ylab("Companies") + ggtitle("Companies by State")
#### Question 2 ####
# NY rows, Industry and Employees columns only.
q2_src_data <- inc5000complete[
  inc5000complete$State == "NY",
  c("Industry", "Employees")
]
# Employees per industry, industries ordered by median, y axis limited to the
# 10th-90th percentile of Employees, outlier points hidden.
ggplot(
  data = q2_src_data,
  aes(x = reorder(Industry, Employees, FUN = median), y = Employees)
) +
  geom_boxplot(outlier.shape = NA) +
  scale_y_continuous(
    limits = quantile(q2_src_data$Employees, c(0.1, 0.9)),
    labels = comma
  ) +
  coord_flip() +
  xlab("Industry") +
  ylab("Employees") +
  ggtitle("Employees Per Company by Industry (NY)")
# Revenue per employee for every company, all states.
q3_src_data <- inc5000complete[, c("Industry", "Employees", "Revenue")]
q3_data <- ddply(q3_src_data, "Industry", transform, avg = Revenue / Employees)
# Same layout as above, applied to revenue per employee.
ggplot(
  data = q3_data,
  aes(x = reorder(Industry, avg, FUN = median), y = avg)
) +
  geom_boxplot(outlier.shape = NA) +
  scale_y_continuous(
    limits = quantile(q3_data$avg, c(0.1, 0.9)),
    labels = comma
  ) +
  coord_flip() +
  xlab("Industry") +
  ylab("Revenue Per Employee") +
  ggtitle("Revenue Per Employee by Industry")
companies <- read.csv("lecture1/data/inc5000_data.csv", header = TRUE)
# Frequency of each State value.
num_of_companies_by_state <- count(companies, vars = "State")
# The same table sorted from most to fewest companies.
num_of_companies_by_state_desc <- arrange(
  num_of_companies_by_state,
  desc(freq)
)
# Reorder the State factor by frequency so the bars plot in that order.
num_of_companies_by_state <- transform(
  num_of_companies_by_state,
  State = reorder(State, freq)
)
#Question 1 Graph
ggplot(num_of_companies_by_state, aes(x = State, y = freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme(aspect.ratio = 2) +
  ylab("Number of Companies") +
  ggtitle("Fastest Growing Companies \nin the U.S. - by State")
#State with the 3rd most companies in the data set
state_3rd_most <- num_of_companies_by_state_desc[[3, 1]]
state_3rd_most
#Number of companies in the state with the 3rd most companies
state_3rd_most_num_comp <- num_of_companies_by_state_desc[[3, 2]]
state_3rd_most_num_comp
# Rows of `companies` belonging to that state (every other state's group
# contributes zero rows).
state_3rd_most_df <- ddply(
  companies, "State",
  function(grp) grp[grp$State == state_3rd_most, ]
)
# boxplot() draws the plot and returns its statistics; $out holds the outliers.
state_3rd_most_boxplot <- boxplot(state_3rd_most_df$Employees)
#Outliers to remove
state_3rd_most_boxplot$out
# One NA per outlier value, used as the replacement vector below.
replace_outliers_NA <- rep(NA, times = length(state_3rd_most_boxplot$out))
state_3rd_most_Employees <- state_3rd_most_df$Employees
#Replace outliers with NA
state_3rd_most_df$Employees <- mapvalues(
  state_3rd_most_Employees,
  state_3rd_most_boxplot$out,
  replace_outliers_NA,
  warn_missing = FALSE
)
#Remove incomplete cases (NA/outliers)
state_3rd_most_no_outliers_df <-
  state_3rd_most_df[complete.cases(state_3rd_most_df), ]
# Mean, minimum and maximum Employees per Industry, each computed separately
# and then gathered into the averages table.
state_3rd_most_avg_emp_df <- ddply(
  state_3rd_most_no_outliers_df, "Industry",
  function(grp) mean(grp$Employees)
)
names(state_3rd_most_avg_emp_df)[2] <- "Average"
state_3rd_most_emp_min <- ddply(
  state_3rd_most_no_outliers_df, "Industry",
  function(grp) min(grp$Employees)
)
names(state_3rd_most_emp_min)[2] <- "Minimum"
state_3rd_most_emp_max <- ddply(
  state_3rd_most_no_outliers_df, "Industry",
  function(grp) max(grp$Employees)
)
names(state_3rd_most_emp_max)[2] <- "Maximum"
state_3rd_most_avg_emp_df$Minimum <- state_3rd_most_emp_min$Minimum
state_3rd_most_avg_emp_df$Maximum <- state_3rd_most_emp_max$Maximum
#Question 2 Graph
# Average, minimum and maximum employees per industry as three point layers.
# Columns are referenced by bare name inside aes(); the layers inherit the
# plot's data, so `df$col` is unnecessary and discouraged by ggplot2.
ggplot(state_3rd_most_avg_emp_df, aes(x = Industry, y = Average, color = "Average")) +
  geom_point() +
  geom_point(aes(y = Minimum, color = "Minimum")) +
  geom_point(aes(y = Maximum, color = "Maximum")) +
  theme(legend.title = element_blank()) +
  ylab("Number of Employees") +
  ggtitle(paste("Employment by Industry for Companies in", state_3rd_most)) +
  coord_flip()
# Rows of `companies` without missing values.
companies_only_full_data <- companies[complete.cases(companies), ]
# Total revenue divided by total employees, per industry.
revenue_per_employee <- ddply(
  companies_only_full_data, "Industry",
  function(grp) sum(grp$Revenue) / sum(grp$Employees)
)
names(revenue_per_employee)[2] <- "Revenue_per_Employee"
# Order the Industry factor by the ratio so the bars plot in that order.
revenue_per_employee <- transform(
  revenue_per_employee,
  Industry = reorder(Industry, Revenue_per_Employee)
)
#Question 3 Graph
ggplot(revenue_per_employee, aes(x = Industry, y = Revenue_per_Employee)) +
  geom_bar(stat = "identity") +
  ylab("Revenue per Employee") +
  ggtitle("Revenue per Employee - by Industry") +
  coord_flip()
# Question 2 graph, redrawn: average, minimum and maximum employees per
# industry. Columns are referenced by bare name inside aes(); the layers
# inherit the plot's data, so `df$col` is unnecessary and discouraged.
ggplot(state_3rd_most_avg_emp_df, aes(x = Industry, y = Average, color = "Average")) +
  geom_point() +
  geom_point(aes(y = Minimum, color = "Minimum")) +
  geom_point(aes(y = Maximum, color = "Maximum")) +
  theme(legend.title = element_blank()) +
  ylab("Number of Employees") +
  ggtitle(paste("Employment by Industry for Companies in", state_3rd_most)) +
  coord_flip()
# Install data.table only if it is missing, and do so *before* loading it.
# The original history called data.table() before the package was installed.
if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages('data.table')
}
# library() stops with an error when a package is missing; require() only
# returns FALSE and lets the script fail later.
library(plyr)
library(ggplot2)
library(data.table)
data.file <- read.csv("lecture1/data/inc5000_data.csv", header = TRUE)
raw.data <- data.file
# Number of rows per state, sorted by that count.
inc_state <- ddply(raw.data, .(State), summarize, count = length(Rank))
inc_state <- inc_state[order(inc_state$count), ] #count
# data.table copy of the raw data.
dtinc <- data.table(raw.data)
#plot using bars and counts as the x-axis
fig1 <- ggplot(data = inc_state, aes(x = State, y = count)) +
  geom_bar(stat = "identity", fill = "white", color = "darkgreen") +
  coord_flip() +
  ggtitle("Number of Companies in Each State")
fig1
#save
# Name both arguments explicitly: the original relied on `file=` partially
# matching ggsave()'s `filename` argument and on fig1 falling into `plot`.
ggsave(filename = paste0("/Users/bcarancibia/CUNY_IS_608/Module_1/data/", "figure1_Arancibia.png"), plot = fig1, height = 11, width = 8, dpi = 100, scale = 0.75)
####Question 2####
# The inc_state table shows New York has the 3rd most companies in the
# dataset. Look at how many people its companies employ per industry, with
# outliers excluded.
# Isolate New York.
ny <- dtinc[dtinc$State == "NY"]
# Look for outliers.
qplot(Name, Employees, data = ny, geom = "jitter")
# Exclude outliers (anything with 300 employees or more).
outlier <- 300
ny <- ny[ny$Employees < outlier, ]
# Show the change visually.
qplot(Name, Employees, data = ny, geom = "jitter")
# Average employment and company count per industry in New York state.
ny_average_ind <- ddply(
  ny, .(Industry), summarize,
  average_employees = mean(Employees),
  num_companies = length(Employees)
)
# Sort by average employment, largest first.
ny_average_ind <- ny_average_ind[
  order(ny_average_ind$average_employees, decreasing = TRUE),
]
fig2 <- ggplot(data = ny_average_ind, aes(x = Industry, y = average_employees)) +
  geom_bar(stat = "identity", colour = "darkgreen") +
  coord_flip() +
  ggtitle("Average Employment by Industry in the State of New York")
fig2
#save figure 2 to folder
# Name both arguments explicitly: the original relied on `file=` partially
# matching ggsave()'s `filename` argument and on fig2 falling into `plot`.
ggsave(filename = paste0("/Users/bcarancibia/CUNY_IS_608/Module_1/data/", "figure2_Arancibia.png"), plot = fig2, height = 11, width = 8, dpi = 100, scale = 0.75)
####Question 3####
# Which industries generate the most revenue per employee?
# Same ddply as above with revenue per employee added.
# From an investor's point of view the growth rate matters too: high revenue
# per employee is less attractive in an industry that is not growing.
ny_rev_employee <- ddply(
  ny, .(Industry), summarize,
  average_employees = mean(Employees),
  num_companies = length(Employees),
  total_employed = sum(Employees),
  average_revenue = mean(Revenue / Employees),
  growth = mean(Growth_Rate)
)
# Sort by average revenue per employee, largest first.
ny_rev_employee <- ny_rev_employee[
  order(ny_rev_employee$average_revenue, decreasing = TRUE),
]
# Scatter plot: revenue per employee on x, industry on y, point size = growth.
fig3 <- ggplot(ny_rev_employee, aes(average_revenue, Industry)) +
  geom_point(aes(size = growth)) +
  ggtitle("Average Revenue and Growth per Industry")
fig3