Synopsis

Storm data spanning from 1950-2011 was acquired from the National Weather Service. This data was processed and analyzed to determine which weather related events were the most costly to human life as well as property and crops.

It was found that tornadoes had the greatest health impact, with a total of 5,640 fatalities and 91,439 injuries, while floods had the greatest economic impact, with a total of ~$215 billion in property damage and ~$12 billion in crop damage.

The repository for this project can be found on my github.

Data Processing

Getting the Data

The original dataset can be found here (dim = [902297, 37])

library(dplyr)
library(data.table)
library(ggplot2)
library(kableExtra)
if(!file.exists("./data")){dir.create("./data")}
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
if(!file.exists("./data/stormdata.csv.bz2")){download.file(fileUrl, "./data/stormdata.csv.bz2", method = "curl")}
stormData <- read.csv("./data/stormdata.csv.bz2")

Cleaning EVTYPE

There is a lot of overlap between categories (Ex. tornadoes involve rain, hail, thunder and lightning) so the order was set up in terms of priority (i.e. what is the likely cause of damage given these properties of the storm). Both “Flood” and “Drought” are symptoms of weather patters, and were placed at the end to remain separate from weather descriptions. A flood or drought may be caused by a number of factors, however the root cause of a tsunami or hurricane would be a more honest description for the source of that damage.

TORNADOS <- grepl("tornado|gustnado|nado|whirlwind|dust devil|spout|wayterspout|funnel|wall cloud|torndao", stormData$EVTYPE, ignore.case = T)
HURRICANES <- grepl("hurricane|typhoon|DEAN|depression|alberto|gordon|Jerry|Tropical storm|floyd", stormData$EVTYPE, ignore.case = T)
MICROBURSTS <- grepl("microburst|downburst|wet microburst|wet micoburst", stormData$EVTYPE, ignore.case = T)
THUNDERSTORMS <- grepl("lightning|thunderstorm|tstm|lighting|tunderstorm|thundestorm|thunderstrom|thundertsorm|thuderstorm|thundeerstorm|thunderestorm|thunerstorm|ligntning|thundertorm|drowning|metro storm", stormData$EVTYPE, ignore.case = T)
TSUNAMI <- grepl("seiche|tsunami", stormData$EVTYPE, ignore.case = T)
WAVE <- grepl("current|tide|heavy seas|surf|swell|wave|marine mishap|marine accident|wake|wave|seas$", stormData$EVTYPE, ignore.case = T)
AVALANCHE <- grepl("avalanche|avalance", stormData$EVTYPE, ignore.case = T)
VOLCANO <- grepl("volcanic|vog", stormData$EVTYPE, ignore.case = T)
TURBULENCE <- grepl("turbulence", stormData$EVTYPE, ignore.case = T)
DAMBREAK <- grepl("dam failure|dam break", stormData$EVTYPE, ignore.case = T)
WINTER <- grepl("ice|icy|winter|snow|sleet|blizzard|frost|wintry|glaze", stormData$EVTYPE, ignore.case = T)
EROSION <- grepl("erosion|erosin|mud slide|mudslide|landslide|landslump|rock slide", stormData$EVTYPE, ignore.case = T)
FLOOD <- grepl("flood|high water|rapidly rising water|surge|coastal|floood|fld", stormData$EVTYPE, ignore.case = T)
DROUGHT <- grepl("drought|dry|driest|^excessive$", stormData$EVTYPE, ignore.case = T)
HAIL <- grepl("hail|pellets", stormData$EVTYPE, ignore.case = T)
FOG <- grepl("fog", stormData$EVTYPE, ignore.case = T)
FIRE <- grepl("fire|smoke", stormData$EVTYPE, ignore.case = T)
RAIN <- grepl("wet weather|heavy rainfall|precipitation|mixed precip|rain|drizzle|heavy shower|heavy mix|wetness|precipatation|wet year|wet month|unseasonably wet|abnormally wet|extremely wet", stormData$EVTYPE, ignore.case = T)
WIND <- grepl("wind damage|high wind|strong wind|gradient wind|^winds$|gusty wind|wind gust|wind advisory|^wind$|gusty lake wind|^ wind$|wnd|wind storm|high  winds|force winds", stormData$EVTYPE, ignore.case = T)
HOT <- grepl("heat|record high|high temp|warm|hot", stormData$EVTYPE, ignore.case = T)
COLD <- grepl("cool|low|frost|hypothermia|hyperthermia|chill|freez|heavy mix|cold", stormData$EVTYPE, ignore.case = T)
DUST <- grepl("dust", stormData$EVTYPE, ignore.case = T)
OTHER <- grepl("other", stormData$EVTYPE, ignore.case = T)

stormData$CATEGORY <- vector("character", length(stormData$EVTYPE))
stormData$CATEGORY[TORNADOS] <- "TORNADO"
stormData$CATEGORY[HURRICANES] <- "HURRICANE"
stormData$CATEGORY[TSUNAMI] <- "TSUNAMI"
stormData$CATEGORY[MICROBURSTS] <- "MICROBURST"
stormData$CATEGORY[THUNDERSTORMS] <- "THUNDERSTORM"
stormData$CATEGORY[WAVE] <- "WAVES"
stormData$CATEGORY[AVALANCHE] <- "AVALANCHE"
stormData$CATEGORY[VOLCANO] <- "VOLCANO"
stormData$CATEGORY[TURBULENCE] <- "TURBULENCE"
stormData$CATEGORY[DAMBREAK] <- "DAMBREAK"
stormData$CATEGORY[WINTER] <- "WINTER STORMS"
stormData$CATEGORY[EROSION] <- "EROSION"
stormData$CATEGORY[FLOOD] <- "FLOOD"
stormData$CATEGORY[DROUGHT] <- "DROUGHT"
stormData$CATEGORY[HAIL] <- "HAIL"
stormData$CATEGORY[FOG] <- "FOG"
stormData$CATEGORY[FIRE] <- "FIRE"
stormData$CATEGORY[RAIN] <- "RAIN"
stormData$CATEGORY[WIND] <- "WIND"
stormData$CATEGORY[HOT] <- "HEAT"
stormData$CATEGORY[COLD] <- "COLD"
stormData$CATEGORY[DUST] <- "DUST"
stormData$CATEGORY[OTHER] <- "OTHER"
stormData$CATEGORY[which(stormData$CATEGORY == "")] <- "OTHER"

Cleaning Damage Columns

For this part I had to do some research considering the EXP columns for damage have values that are not mentioned in the describing documents. I used this analysis as a reference to determine the values of the unintuitive exponent values. This resource was used to compare our dataset to the actual values originally published.

stormData$PROPDMGEXP <- gsub("\\?", 0, stormData$PROPDMGEXP)
stormData$PROPDMGEXP <- gsub("\\-", 0, stormData$PROPDMGEXP)
stormData$PROPDMGEXP <- gsub("\\+", 1, stormData$PROPDMGEXP)
stormData$PROPDMGEXP <- gsub("0|1|2|3|4|5|6|7|8", 10, stormData$PROPDMGEXP)
stormData$PROPDMGEXP <- gsub("H|h", 100, stormData$PROPDMGEXP)
stormData$PROPDMGEXP <- gsub("K|k", 1000, stormData$PROPDMGEXP)
stormData$PROPDMGEXP <- gsub("M|m", 1000000, stormData$PROPDMGEXP)
stormData$PROPDMGEXP <- gsub("B|b", 1000000000, stormData$PROPDMGEXP)
stormData$PROPDMGEXP[which(stormData$PROPDMGEXP == "")] <- 0
stormData$PROPDMG[which(stormData$PROPDMG == "")] <- 0

stormData$CROPDMGEXP <- gsub("\\?", 0, stormData$CROPDMGEXP)
stormData$CROPDMGEXP <- gsub("\\-", 0, stormData$CROPDMGEXP)
stormData$CROPDMGEXP <- gsub("\\+", 1, stormData$CROPDMGEXP)
stormData$CROPDMGEXP <- gsub("0|1|2|3|4|5|6|7|8", 10, stormData$CROPDMGEXP)
stormData$CROPDMGEXP <- gsub("H|h", 100, stormData$CROPDMGEXP)
stormData$CROPDMGEXP <- gsub("K|k", 1000, stormData$CROPDMGEXP)
stormData$CROPDMGEXP <- gsub("M|m", 1000000, stormData$CROPDMGEXP)
stormData$CROPDMGEXP <- gsub("B|b", 1000000000, stormData$CROPDMGEXP)
stormData$CROPDMGEXP[which(stormData$CROPDMGEXP == "")] <- 0
stormData$CROPDMG[which(stormData$CROPDMG == "")] <- 0

stormData$PROPDMGCOST <- stormData$PROPDMG * as.numeric(stormData$PROPDMGEXP)
stormData$CROPDMGCOST <- stormData$CROPDMG * as.numeric(stormData$CROPDMGEXP)

Results

Harm Impact

The original data was grouped and summarized into the variable EVdata. New columns were also calculated to better understand the proportion of each events harm effects relative to the total harm by all events. The data was sorted by the average of these proportions.

EVdata <- stormData %>%
    group_by(CATEGORY) %>% 
    summarise(FATALITIES = sum(FATALITIES), INJURIES = sum(INJURIES)) %>%
    mutate(FATALITY.PROPORTION = round((FATALITIES/sum(FATALITIES)*100), digits = 1), INJURY.PROPORTION = round((INJURIES/sum(INJURIES)*100), digits = 1), PROPORTION.AVERAGE = ((FATALITIES/sum(FATALITIES)*100) + (INJURIES/sum(INJURIES)*100)) / 2) %>%
    arrange(desc(PROPORTION.AVERAGE)) #%>%
    #rename(EVENT.TYPE = EVTYPE)

This table shows the actual values and relative proportions of harm.

The numbers of individuals were formatted with “,” characters to appear more readable.

EVdataTable <- EVdata
EVdataTable$FATALITIES <- formatC(EVdata$FATALITIES, big.mark = ",", format = "fg")
EVdataTable$INJURIES<- formatC(EVdata$INJURIES, big.mark = ",", format = "fg")
EVdataTable$FATALITY.PROPORTION <- paste0(EVdataTable$FATALITY.PROPORTION, "%")
EVdataTable$INJURY.PROPORTION <- paste0(EVdataTable$INJURY.PROPORTION, "%")

kable(EVdataTable[, 1:5], caption = "Table 1. Fatality and Injuries due to Natural Events from 1950-2011") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Table 1. Fatality and Injuries due to Natural Events from 1950-2011
CATEGORY FATALITIES INJURIES FATALITY.PROPORTION INJURY.PROPORTION
TORNADO 5,640 91,439 37.2% 65.1%
HEAT 3,178 9,243 21% 6.6%
THUNDERSTORM 1,543 14,679 10.2% 10.4%
FLOOD 1,584 8,728 10.5% 6.2%
WINTER STORMS 632 6,264 4.2% 4.5%
WAVES 764 799 5% 0.6%
WIND 454 1,948 3% 1.4%
COLD 491 380 3.2% 0.3%
HURRICANE 199 1,714 1.3% 1.2%
FIRE 90 1,608 0.6% 1.1%
AVALANCHE 225 170 1.5% 0.1%
HAIL 45 1,467 0.3% 1%
FOG 80 1,076 0.5% 0.8%
RAIN 108 308 0.7% 0.2%
DUST 24 483 0.2% 0.3%
EROSION 44 55 0.3% 0%
TSUNAMI 33 129 0.2% 0.1%
OTHER 8 5 0.1% 0%
DROUGHT 3 33 0% 0%
DAMBREAK 0 0 0% 0%
MICROBURST 0 0 0% 0%
TURBULENCE 0 0 0% 0%
VOLCANO 0 0 0% 0%

This graph shows the fatalities and injuries associated with each event type. The data is sorted by the average of the injury and fatality relative proportions for each event.

Tornados are revealed to cause much more harm than other events (both respect to deaths and injuries), however heat related weather, thunderstorms, floods and winter storms also

graphdata <- melt(setDT(EVdata[, c(1:3, 5)]), 
                  id.vars = c("CATEGORY"), 
                  measure.vars = c("FATALITIES", "INJURIES"),
                  variable.name = "HARM.TYPE")

#didn't work, merge PROPORTION.AVERAGE to sort
graphdata <- merge(EVdata[, c(1, 6)], graphdata)

ggplot(graphdata, aes(x = reorder(factor(CATEGORY), -PROPORTION.AVERAGE), y = value, fill = HARM.TYPE)) +
    geom_col(position = "stack") + 
    theme(axis.text.x = element_text(angle = 90)) +
    labs(x = "Event Type", 
         y = "Number of People", 
         title = "Fatality and Injuries due to Natural Events from 1950-2011", 
         fill = "Harm Type")

Figure 1. Fatality and Injuries due to Natural Events from 1950-2011

Economic Impact

The original data was grouped and summarized into the variable COSTdata. The data was sorted by the total cost of damage for each event type (property + crop).

COSTdata <- stormData %>%
    group_by(CATEGORY) %>% 
    summarise(PROPERTY.DAMAGE = sum(PROPDMGCOST), CROP.DAMAGE = sum(CROPDMGCOST)) %>%
    mutate(TOTAL.COST = PROPERTY.DAMAGE + CROP.DAMAGE, PROPORTION = round((TOTAL.COST/sum(TOTAL.COST)) * 100, digits = 1)) %>%
    arrange(desc(TOTAL.COST)) 

A table is included for this data as well, showing the true values of cost associated with each event type. Units were included and numerical values were separated with a “,” character for easy readability.

COSTdataTable <- COSTdata
COSTdataTable$PROPERTY.DAMAGE <- formatC(COSTdata$PROPERTY.DAMAGE, big.mark = ",", format = "fg")
COSTdataTable$CROP.DAMAGE <- formatC(COSTdataTable$CROP.DAMAGE, big.mark = ",", format = "fg")
COSTdataTable$TOTAL.COST <- formatC(COSTdataTable$TOTAL.COST, big.mark = ",", format = "fg")

COSTdataTable$PROPERTY.DAMAGE <- paste0("$", COSTdataTable$PROPERTY.DAMAGE)
COSTdataTable$CROP.DAMAGE <- paste0("$", COSTdataTable$CROP.DAMAGE)
COSTdataTable$TOTAL.COST <- paste0("$", COSTdataTable$TOTAL.COST)
COSTdataTable$PROPORTION <- paste0(COSTdataTable$PROPORTION, "%")

kable(COSTdataTable, caption = "Table 2. Economic Cost due to Natural Events from 1950-2011 (USD)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Table 2. Economic Cost due to Natural Events from 1950-2011 (USD)
CATEGORY PROPERTY.DAMAGE CROP.DAMAGE TOTAL.COST PROPORTION
FLOOD $215,532,626,823 $12,276,279,200 $227,808,906,023 47.8%
HURRICANE $92,972,462,560 $6,201,013,800 $99,173,476,360 20.8%
TORNADO $57,002,980,987 $414,964,460 $57,417,945,447 12.1%
HAIL $17,619,973,877 $3,114,213,050 $20,734,186,927 4.4%
WINTER STORMS $12,350,972,477 $5,311,281,400 $17,662,253,877 3.7%
DROUGHT $1,052,838,600 $13,972,581,000 $15,025,419,600 3.2%
THUNDERSTORM $11,859,665,139 $1,218,951,540 $13,078,616,679 2.7%
FIRE $8,501,728,500 $403,281,630 $8,905,010,130 1.9%
WIND $6,250,686,290 $765,345,400 $7,016,031,690 1.5%
RAIN $3,247,684,210 $1,061,305,800 $4,308,990,010 0.9%
COLD $286,928,950 $3,435,826,550 $3,722,755,500 0.8%
HEAT $20,325,750 $904,479,280 $924,805,030 0.2%
EROSION $327,496,100 $20,017,000 $347,513,100 0.1%
TSUNAMI $145,042,000 $20,000 $145,062,000 0%
WAVES $112,225,000 $0 $112,225,000 0%
FOG $22,829,500 $0 $22,829,500 0%
DUST $6,338,130 $3,600,000 $9,938,130 0%
AVALANCHE $3,721,800 $0 $3,721,800 0%
OTHER $75,550 $1,034,400 $1,109,950 0%
DAMBREAK $1,002,000 $0 $1,002,000 0%
MICROBURST $567,000 $0 $567,000 0%
VOLCANO $500,000 $0 $500,000 0%
TURBULENCE $50,000 $0 $50,000 0%

This plot shows the total cost in billions of dollars due to each natural event. The greatest economic consequences are shown to come from floods, followed by hurricanes and tornadoes.

ggplot(COSTdata, aes(x = reorder(factor(CATEGORY), -TOTAL.COST), 
                     y = TOTAL.COST/1000000000, 
                     fill = TOTAL.COST/1000000000)) +
    geom_col() + 
    theme(axis.text.x = element_text(angle = 90)) + 
    labs(x = "Event Type", 
         y = "Cost of Damage (Billions of Dollars)", 
         title = "Economic Cost due to Natural Events from 1950-2011", 
         fill = "Billions of Dollars")