# Import pandas to read csv file and manage heterogenous data
import pandas as pd

# Import numpy to store numeric information and perform numerical analysis
import numpy as np

# Import seaborn and matplotlib to visualize data
import seaborn as sns
import matplotlib.pyplot as plt

# Import numpy to store numeric information and perform numerical analysis
import numpy as np

# Import seaborn and matplotlib to visualize data
import seaborn as sns
import matplotlib.pyplot as plt

#Import scipy to gather statistics
from scipy import stats

# Import patsy and statsmodels for regression analysis
import patsy
import statsmodels.api as sm

# Import math for using some math function
import math

import warnings

import os

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

# Load the California wildfire incidents data set in data frame
# We get this data set from Kaggle (https://www.kaggle.com/ananthu017/california-wildfire-incidents-20132020)
wildfire = pd.read_csv("California_Fire_Incidents.csv")


# Load the US weather station ID data set in data frame
# We get the Integrated Surface Data (ISD) station list from ncdc.noaa.gov
station = pd.read_csv("https://www1.ncdc.noaa.gov/pub/data/noaa/isd-history.csv")



# Load the US weather daily data set from 2013 to 2019 in data frame
# We get this data from ncei.noaa.gov and download to the local.
# (https://www.ncei.noaa.gov/data/global-summary-of-the-day/archive/)
for dirname, _, filenames in os.walk('/Users/wenbohu/Desktop/Weather'):
    for filename in filenames:
        print((os.path.join(dirname, filename)))
        
# get all subdiretory of all tables
file_dict ={}
for path, dirs, files in os.walk('/Users/wenbohu/Desktop/Weather', topdown=False):
    file_dict[path]=files
    
paths = list(file_dict.keys())

events = []
for path in paths:
    events += [os.path.join(path,file) for file in file_dict[path]]

# delete the irrelevant columns
wildfire = wildfire[['AcresBurned','Started','Counties', 'Latitude', 'Longitude']]

# change the started time into date
wildfire['Started'] = [x[0:10] for x in wildfire['Started']]

# change the 'Started' column name into 'Date'
wildfire = wildfire.rename({'Started':'Date'}, axis='columns')

#drop the null values 
wildfire['Latitude'] = wildfire['Latitude'].apply(lambda x: np.nan if x == 0 else x)
wildfire = wildfire.dropna().reset_index(drop=True)

wildfire.head()

# Since the weather station ID is a combination of column 'USAF' and 'WBAN',
# we combine these two columns into a new column called 'ID'
station['ID']= station['USAF'].astype(str) + station['WBAN'].astype(str)

# we only analyze California weather
station = station[(station['STATE']=='CA') & (station['CTRY']=='US')].reset_index(drop=True)

# station only need to include the ID and the nameof the station
pd.set_option("max_rows", None)

station.head()

IDlist = []
for i,j in wildfire.iterrows():
    before = len(IDlist)
    for a,b in station.iterrows():
        #about 50km * 40km when 0.5 then tried 0.3, 0.1, and 0.2
        if (((b['LAT'] <= j['Latitude'] + 0.2) and (b['LAT'] >= j['Latitude'] - 0.2)) 
        and (( b['LON'] <= j['Longitude'] + 0.2) and ( b['LON'] >= j['Longitude'] - 0.2))):
            IDlist.append(b['ID'])
            break
    after = len(IDlist)
    if (before == after):
        IDlist.append("Not_Found")

# 0.5-95 0.3-119 0.1-119 0.2-127(THIS IS THE BEST!!!!)
# when 0.1 it's also 119 but lots of not found values
unique = []
for x in IDlist:
    if x not in unique:
        unique.append(x)
print(len(unique))

127

IDlist = []
row_fire = []
join_id =[]
num = 1
# create a new dataframe to store these matched data
matched_wildfire = pd.DataFrame(columns=wildfire.columns)

# iterate the rows in wildfire and station to find the matched data
for i,j in wildfire.iterrows():
    for a,b in station.iterrows():
        #about 22.2km * 20km area from fire accidents   
        if (((b['LAT'] <= j['Latitude'] + 0.2) and (b['LAT'] >= j['Latitude'] - 0.2)) 
        and (( b['LON'] <= j['Longitude'] + 0.2) and ( b['LON'] >= j['Longitude'] - 0.2))):
            IDlist.append(b['ID'])
            join_id.append(num)
            num += 1
            row_fire.append(list(j))
            break
        
matched_wildfire = matched_wildfire.append(pd.DataFrame(row_fire,columns=wildfire.columns))
matched_wildfire = matched_wildfire.assign(Station_ID = IDlist)
matched_wildfire = matched_wildfire.assign(Join_ID=join_id)

matched_wildfire.head()

example = pd.read_csv("/Users/wenbohu/Desktop/Weather/2013/40854099999.csv")
example.head()

# Get the weather information of wildfire start date 
row_weather = []
join_id = []
num = 1

# create a data frame to store the weather data
weather = pd.DataFrame(columns = example.columns)

for i, j in matched_wildfire.iterrows():
    for file in events:
        if num in join_id:
            break
        if (file[-15:-4] == j['Station_ID']):
            temp = pd.read_csv(file)
            for a, b in temp.iterrows():
                if (b["DATE"] == j['Date']):
                    row_weather.append(list(b))
                    join_id.append(num)
                    break 
    num += 1

weather = pd.DataFrame(columns = example.columns)
weather = weather.append(pd.DataFrame(row_weather, columns = example.columns))
weather = weather.assign(Join_ID = join_id)

weather.head()

dataframe = matched_wildfire.merge(weather, on = 'Join_ID')
dataframe.to_csv('/Users/wenbohu/Desktop/df.csv')

df = pd.read_csv('df.csv')

df = df[['AcresBurned', 'Date', 'Counties','TEMP','DEWP','WDSP','PRCP']] 

# By the column description, we replace the missing value (9999.9 or 999.9) with np.nan
df['TEMP'] = df['TEMP'].replace(9999.9, np.nan, regex=True) 
df['DEWP'] = df['DEWP'].replace(9999.9, np.nan, regex=True) 
df['WDSP'] = df['WDSP'].replace(999.9, np.nan, regex=True) 
# Drop the NAN value
df = df.dropna(subset=['TEMP','DEWP','WDSP'])

# convert the temperature and the dewpoint from Fahrenheit to Celsius
df['TEMP'] = 5.0 / 9.0 * (df['TEMP'] - 32.0)
df['DEWP'] = 5.0 / 9.0 * (df['DEWP'] - 32.0)

# calculate saturation vapor pressure(Es) and actual vapor pressure(E) in millibars.
df['Es'] = 6.11*10.0**(7.5*df['TEMP']/(237.7+df['TEMP']))
df['E'] = 6.11*10.0**(7.5*df['DEWP']/(237.7+df['DEWP']))

# Once you have the saturation vapor pressure and the actual vapor pressure, 
# relative humidity(RH) can be computed by dividing the actual vapor pressure by the saturation vapor pressure 
# and then multiplying by 100 to convert the quantity to a percent.
df ['RelaHumPct'] = (df['E']/df['Es'])*100

# Rename the columns
df = df.rename(columns={'Counties':'County','TEMP':'Temp','DEWP':'DewPt','WDSP': 'WindSpd','PRCP':'Precipitation'})
df = df[['AcresBurned','Date','County','Temp','WindSpd','Precipitation','RelaHumPct']]

df.head()

# determine the shape of the data
df.shape

(372, 7)

# get descriptive statistics for quantitative variables
df.describe()

# Checking the data type
df.dtypes

AcresBurned      float64
Date              object
County            object
Temp             float64
WindSpd          float64
Precipitation    float64
RelaHumPct       float64
dtype: object

# first look at the scatter matrix
scatter = pd.plotting.scatter_matrix(df, figsize=(15, 10))

/opt/conda/lib/python3.7/site-packages/pandas/plotting/_matplotlib/tools.py:307: MatplotlibDeprecationWarning: 
The rowNum attribute was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use ax.get_subplotspec().rowspan.start instead.
  layout[ax.rowNum, ax.colNum] = ax.get_visible()
/opt/conda/lib/python3.7/site-packages/pandas/plotting/_matplotlib/tools.py:307: MatplotlibDeprecationWarning: 
The colNum attribute was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use ax.get_subplotspec().colspan.start instead.
  layout[ax.rowNum, ax.colNum] = ax.get_visible()
/opt/conda/lib/python3.7/site-packages/pandas/plotting/_matplotlib/tools.py:313: MatplotlibDeprecationWarning: 
The rowNum attribute was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use ax.get_subplotspec().rowspan.start instead.
  if not layout[ax.rowNum + 1, ax.colNum]:
/opt/conda/lib/python3.7/site-packages/pandas/plotting/_matplotlib/tools.py:313: MatplotlibDeprecationWarning: 
The colNum attribute was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use ax.get_subplotspec().colspan.start instead.
  if not layout[ax.rowNum + 1, ax.colNum]:

# look at burning areas (in acres) distributions
sns.boxplot(x=df['AcresBurned']);

outlier = df[df['AcresBurned'] == df['AcresBurned'].max()]
outlier

df = df.drop_duplicates(subset=['Date','AcresBurned'])

# AcresBurned
ax = sns.distplot(df['AcresBurned'], hist=True, kde=True, 
                  bins=int(len(df['AcresBurned'])/10),
                  color='red',hist_kws={'edgecolor':'black'}, 
                  kde_kws={'linewidth':4})

ax.set(xlabel='Acres Burned', ylabel='Frequency', title='Acres Burned Distribution')
plt.show()

# look at temperature distributions
ax = sns.distplot(df['Temp'], hist=True, kde=True, 
                  bins=int(len(df['Temp'])/10),
                  color='blue',hist_kws={'edgecolor':'black'}, 
                  kde_kws={'linewidth':4})

ax.set(xlabel='Temperature', ylabel='Density', title='Temperature Distribution')
plt.show()

# look at wind speed distributions
ax = sns.distplot(df['WindSpd'], hist=True, kde=True, 
                  bins=int(len(df['WindSpd'])/10),
                  color='brown',hist_kws={'edgecolor':'black'}, 
                  kde_kws={'linewidth':4})

ax.set(xlabel='Wind Speed', ylabel='Density', title='Wind Speed Distribution')
plt.show()

# look at air relative humidity (in percentage) distributions
ax = sns.distplot(df['RelaHumPct'], hist=True, kde=True, 
                  bins=int(len(df['RelaHumPct'])/10),
                  color='orange',hist_kws={'edgecolor':'black'}, 
                  kde_kws={'linewidth':4})

ax.set(xlabel='Relative Humidity Percentage', ylabel='Density', title='Relative Humidity Percent Distribution')
plt.show()

sns.boxplot(x=df['Precipitation']);

df['Date'] = pd.to_datetime(df['Date'])
df['Month'] = df['Date'].dt.month
sns.lineplot(x = 'Month', y = 'AcresBurned', data = df);

# Relationship between the arces burned and temperatures of wildfire incidents
sns.lmplot(x = 'Temp',
           y = 'AcresBurned',
           data = df, 
           fit_reg = True, 
           height = 6,
           aspect = 1.7, 
           line_kws={'color': 'purple'},
           scatter_kws={'color': 'purple'})
plt.ylabel('Acres of Land Affected By Wildfires', size=18)
plt.xlabel('The Mean Temperature of The Day (in Celsius)', size=18)
plt.title('');

df['AcresBurned_log10'] = np.log10(df['AcresBurned'] + 0.1)

sns.distplot(df['AcresBurned_log10'], 
             kde=True, bins=40, color="red");

# this should show you all of the papers with zero pulitzer prizes
df[df['AcresBurned_log10'] < 0]

# plot relationship between the burning area and temperature 
sns.lmplot(y = 'AcresBurned_log10',
           x = 'Temp',
           data = df, 
           fit_reg = True, 
           height = 6,
           aspect = 1.7, 
           line_kws={'color': 'blue'},
           scatter_kws={'color': 'blue'})
plt.ylabel('Acres of Land Affected By Wildfires', size=18)
plt.xlabel('The Mean Temperature of The Day (in Celsius)', size=18)
plt.title('');

# carry out linear regression
acres_temp = df[['Temp', 'AcresBurned_log10']]
acres_temp.columns = ['Temp', 'AcresBurned_log10']

outcome, predictors = patsy.dmatrices('AcresBurned_log10 ~ Temp', acres_temp)

# Now use statsmodels to intialize an OLS linear model
# This step initializes the model, and provides the data (but does not actually compute the model)
mod_log = sm.OLS(outcome, predictors)

# fit the model
res_log = mod_log.fit()

# Check out the results
print(res_log.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:      AcresBurned_log10   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.9026
Date:                Mon, 15 Mar 2021   Prob (F-statistic):              0.343
Time:                        21:37:39   Log-Likelihood:                -499.33
No. Observations:                 372   AIC:                             1003.
Df Residuals:                     370   BIC:                             1011.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.8873      0.213      8.865      0.000       1.469       2.306
Temp           0.0087      0.009      0.950      0.343      -0.009       0.027
==============================================================================
Omnibus:                       84.292   Durbin-Watson:                   0.392
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              194.747
Skew:                           1.131   Prob(JB):                     5.14e-43
Kurtosis:                       5.729   Cond. No.                         103.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# Relationship between the arces burned and temperatures of wildfire incidents
sns.lmplot(y = 'AcresBurned',
           x = 'WindSpd',
           data = df, 
           fit_reg = True, 
           height = 6,
           aspect = 1.7, 
           line_kws={'color': 'brown'},
           scatter_kws={'color': 'brown'})
plt.ylabel('Acres of Land Affected By Wildfires', size=18)
plt.xlabel('The Mean Wind Speed of The Day (in Knots)', size=18)
plt.title('');

# Relationship between the arces burned and temperatures of wildfire incidents
sns.lmplot(y = 'AcresBurned_log10',
           x = 'WindSpd',
           data = df, 
           fit_reg = True, 
           height = 6,
           aspect = 1.7, 
           line_kws={'color': 'brown'},
           scatter_kws={'color': 'brown'})
plt.ylabel('Acres of Land Affected By Wildfires', size=18)
plt.xlabel('The Mean Wind Speed of The Day (in knots)', size=18)
plt.title('');

# carry out linear regression
acres_windspd = df[['WindSpd', 'AcresBurned_log10']]
acres_windspd.columns = ['WindSpd', 'AcresBurned_log10']

outcome, predictors = patsy.dmatrices('AcresBurned_log10 ~ WindSpd', acres_windspd)

# Now use statsmodels to intialize an OLS linear model
# This step initializes the model, and provides the data (but does not actually compute the model)
mod_log = sm.OLS(outcome, predictors)

# fit the model
res_log = mod_log.fit()

# Check out the results
print(res_log.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:      AcresBurned_log10   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.003
Method:                 Least Squares   F-statistic:                   0.04285
Date:                Mon, 15 Mar 2021   Prob (F-statistic):              0.836
Time:                        21:38:53   Log-Likelihood:                -499.76
No. Observations:                 372   AIC:                             1004.
Df Residuals:                     370   BIC:                             1011.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.0635      0.112     18.494      0.000       1.844       2.283
WindSpd        0.0035      0.017      0.207      0.836      -0.029       0.036
==============================================================================
Omnibus:                       84.386   Durbin-Watson:                   0.393
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              202.619
Skew:                           1.116   Prob(JB):                     1.00e-44
Kurtosis:                       5.845   Cond. No.                         15.7
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

sns.lmplot(y = 'AcresBurned',
           x = 'RelaHumPct',
           data = df, 
           fit_reg = True, 
           height = 6,
           aspect = 1.7, 
           line_kws={'color': 'orange'},
           scatter_kws={'color': 'orange'})
plt.ylabel('Acres of Land Affected By Wildfires', size=18)
plt.xlabel('The Relative Humidity Percentage of The Day', size=18)
plt.title('');

sns.lmplot(y = 'AcresBurned_log10',
           x = 'RelaHumPct',
           data = df, 
           fit_reg = True, 
           height = 6,
           aspect = 1.7, 
           line_kws={'color': 'orange'},
           scatter_kws={'color': 'orange'})
plt.ylabel('Acres of Land Affected By Wildfires', size=18)
plt.xlabel('The Relative Humidity Percentage of The Day', size=18)
plt.title('');

# carry out linear regression
acres_rhp = df[['RelaHumPct', 'AcresBurned_log10']]
acres_rhp.columns = ['RelaHumPct', 'AcresBurned_log10']

outcome, predictors = patsy.dmatrices('AcresBurned_log10 ~ RelaHumPct', acres_rhp)

# Now use statsmodels to intialize an OLS linear model
# This step initializes the model, and provides the data (but does not actually compute the model)
mod_log = sm.OLS(outcome, predictors)

# fit the model
res_log = mod_log.fit()

# Check out the results
print(res_log.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:      AcresBurned_log10   R-squared:                       0.031
Model:                            OLS   Adj. R-squared:                  0.028
Method:                 Least Squares   F-statistic:                     11.80
Date:                Mon, 15 Mar 2021   Prob (F-statistic):           0.000661
Time:                        21:39:25   Log-Likelihood:                -493.95
No. Observations:                 372   AIC:                             991.9
Df Residuals:                     370   BIC:                             999.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.4831      0.125     19.795      0.000       2.236       2.730
RelaHumPct    -0.0095      0.003     -3.435      0.001      -0.015      -0.004
==============================================================================
Omnibus:                       80.570   Durbin-Watson:                   0.455
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              190.830
Skew:                           1.072   Prob(JB):                     3.65e-42
Kurtosis:                       5.778   Cond. No.                         120.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

#Combine Temperature, Wind Speed
df['Temp_Wsp'] = df['Temp'] + df['WindSpd']

sns.lmplot(y = 'AcresBurned_log10',
           x = 'Temp_Wsp',
           data = df, 
           fit_reg = True, 
           height = 6,
           aspect = 1.7, 
           line_kws={'color': 'green'},
           scatter_kws={'color': 'green'})
plt.ylabel('Acres of Land Affected By Wildfires', size=18)
plt.xlabel('Combined Temperature, Wind Speed', size=18)
plt.title('');

# carry out linear regression
acres_rhp = df[['Temp_Wsp', 'AcresBurned_log10']]
acres_rhp.columns = ['Temp_Wsp', 'AcresBurned_log10']

outcome, predictors = patsy.dmatrices('AcresBurned_log10 ~ Temp_Wsp', acres_rhp)

# Now use statsmodels to intialize an OLS linear model
# This step initializes the model, and provides the data (but does not actually compute the model)
mod_log = sm.OLS(outcome, predictors)

# fit the model
res_log = mod_log.fit()

# Check out the results
print(res_log.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:      AcresBurned_log10   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.9749
Date:                Mon, 15 Mar 2021   Prob (F-statistic):              0.324
Time:                        21:40:06   Log-Likelihood:                -499.30
No. Observations:                 372   AIC:                             1003.
Df Residuals:                     370   BIC:                             1010.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.8438      0.248      7.427      0.000       1.356       2.332
Temp_Wsp       0.0084      0.009      0.987      0.324      -0.008       0.025
==============================================================================
Omnibus:                       84.435   Durbin-Watson:                   0.398
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              200.153
Skew:                           1.122   Prob(JB):                     3.45e-44
Kurtosis:                       5.807   Cond. No.                         151.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

#Combine Relative Humidity Percentage, Temperature
df['Humid_Temp'] = df['RelaHumPct'] + df['Temp']

sns.lmplot(y = 'AcresBurned_log10',
           x = 'Humid_Temp',
           data = df, 
           fit_reg = True, 
           height = 6,
           aspect = 1.7, 
           line_kws={'color': 'green'},
           scatter_kws={'color': 'green'})
plt.ylabel('Acres of Land Affected By Wildfires', size=18)
plt.xlabel('Combined Relative Humidity Percentage, Temperature', size=18)
plt.title('');

# carry out linear regression
acres_rhp = df[['Humid_Temp', 'AcresBurned_log10']]
acres_rhp.columns = ['Humid_Temp', 'AcresBurned_log10']

outcome, predictors = patsy.dmatrices('AcresBurned_log10 ~ Humid_Temp', acres_rhp)

# Now use statsmodels to intialize an OLS linear model
# This step initializes the model, and provides the data (but does not actually compute the model)
mod_log = sm.OLS(outcome, predictors)

# fit the model
res_log = mod_log.fit()

# Check out the results
print(res_log.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:      AcresBurned_log10   R-squared:                       0.031
Model:                            OLS   Adj. R-squared:                  0.028
Method:                 Least Squares   F-statistic:                     11.68
Date:                Mon, 15 Mar 2021   Prob (F-statistic):           0.000701
Time:                        21:40:37   Log-Likelihood:                -494.00
No. Observations:                 372   AIC:                             992.0
Df Residuals:                     370   BIC:                             999.8
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.7497      0.200     13.722      0.000       2.356       3.144
Humid_Temp    -0.0103      0.003     -3.418      0.001      -0.016      -0.004
==============================================================================
Omnibus:                       80.481   Durbin-Watson:                   0.456
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              197.293
Skew:                           1.057   Prob(JB):                     1.44e-43
Kurtosis:                       5.874   Cond. No.                         281.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

#Combine Relative Humidity Percentage, Wind Speed
df['Humid_Wsp'] = df['RelaHumPct'] + df['WindSpd']

sns.lmplot(y = 'AcresBurned_log10',
           x = 'Humid_Wsp',
           data = df, 
           fit_reg = True, 
           height = 6,
           aspect = 1.7, 
           line_kws={'color': 'green'},
           scatter_kws={'color': 'green'})
plt.ylabel('Acres of Land Affected By Wildfires', size=18)
plt.xlabel('Combined Relative Humidity Percentage, Wind Speed', size=18)
plt.title('');

# carry out linear regression
acres_rhp = df[['Humid_Wsp', 'AcresBurned_log10']]
acres_rhp.columns = ['Humid_Wsp', 'AcresBurned_log10']

outcome, predictors = patsy.dmatrices('AcresBurned_log10 ~ Humid_Wsp', acres_rhp)

# Now use statsmodels to intialize an OLS linear model
# This step initializes the model, and provides the data (but does not actually compute the model)
mod_log = sm.OLS(outcome, predictors)

# fit the model
res_log = mod_log.fit()

# Check out the results
print(res_log.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:      AcresBurned_log10   R-squared:                       0.030
Model:                            OLS   Adj. R-squared:                  0.028
Method:                 Least Squares   F-statistic:                     11.60
Date:                Mon, 15 Mar 2021   Prob (F-statistic):           0.000734
Time:                        21:41:01   Log-Likelihood:                -494.05
No. Observations:                 372   AIC:                             992.1
Df Residuals:                     370   BIC:                             999.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.5374      0.141     17.962      0.000       2.260       2.815
Humid_Wsp     -0.0094      0.003     -3.405      0.001      -0.015      -0.004
==============================================================================
Omnibus:                       80.101   Durbin-Watson:                   0.446
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              184.957
Skew:                           1.076   Prob(JB):                     6.87e-41
Kurtosis:                       5.702   Cond. No.                         152.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

#Combine Relative Humidity Percentage, Temperature, Wind Speed
df['Humid_Temp_Wsp'] = df['RelaHumPct'] + df['Temp'] + df['WindSpd']

sns.lmplot(y = 'AcresBurned_log10',
           x = 'Humid_Temp_Wsp',
           data = df, 
           fit_reg = True, 
           height = 6,
           aspect = 1.7, 
           line_kws={'color': 'green'},
           scatter_kws={'color': 'green'})
plt.ylabel('Acres of Land Affected By Wildfires', size=18)
plt.xlabel('The Relative Humidity Percentage, Temperature, Wind Speed', size=18)
plt.title('');

# carry out linear regression
acres_rhp = df[['Humid_Temp_Wsp', 'AcresBurned_log10']]
acres_rhp.columns = ['Humid_Temp_Wsp', 'AcresBurned_log10']

outcome, predictors = patsy.dmatrices('AcresBurned_log10 ~ Humid_Temp_Wsp', acres_rhp)

# Now use statsmodels to intialize an OLS linear model
# This step initializes the model, and provides the data (but does not actually compute the model)
mod_log = sm.OLS(outcome, predictors)

# fit the model
res_log = mod_log.fit()

# Check out the results
print(res_log.summary())

                            OLS Regression Results                            
==============================================================================
Dep. Variable:      AcresBurned_log10   R-squared:                       0.021
Model:                            OLS   Adj. R-squared:                  0.018
Method:                 Least Squares   F-statistic:                     7.575
Date:                Sat, 13 Mar 2021   Prob (F-statistic):            0.00622
Time:                        20:39:09   Log-Likelihood:                -439.18
No. Observations:                 358   AIC:                             882.4
Df Residuals:                     356   BIC:                             890.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          2.5684      0.203     12.661      0.000       2.169       2.967
Humid_Temp_Wsp    -0.0077      0.003     -2.752      0.006      -0.013      -0.002
==============================================================================
Omnibus:                       59.269   Durbin-Watson:                   0.540
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              149.649
Skew:                           0.801   Prob(JB):                     3.19e-33
Kurtosis:                       5.733   Cond. No.                         336.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

def set_class(area):
    if area < 1/4:
        return "A"
    if area < 10:
        return "B"
    if area < 100:
        return "C"
    if area < 300:
        return "D"
    if area < 1000:
        return "E"
    if area < 5000:
        return "F"
    else:
        return "G"

# apply the set_class() on 'AcresBurned' and store in a new column called 'class'
df['class'] = df['AcresBurned'].apply(set_class)

#data partition 
df_X = df[['WindSpd','RelaHumPct']]
df_Y = df['class']

X_train, X_test = train_test_split(df_X, test_size = 0.2, random_state = 10)
Y_train, Y_test = train_test_split(df_Y, test_size = 0.2, random_state = 10)

def train_DT(X, Y):
    clf = DecisionTreeClassifier()
    clf.fit(X, Y)
    return clf

#train the decision tree model
model = train_DT(X_train, Y_train)

#take a look at the report for train data
pred_train = model.predict(X_train)
print(classification_report(Y_train, pred_train))

              precision    recall  f1-score   support

           A       1.00      1.00      1.00         3
           C       0.99      1.00      1.00       171
           D       1.00      1.00      1.00        56
           E       0.96      1.00      0.98        22
           F       1.00      0.96      0.98        23
           G       1.00      0.95      0.98        22

    accuracy                           0.99       297
   macro avg       0.99      0.99      0.99       297
weighted avg       0.99      0.99      0.99       297

#take a look at the report for test data
pred_test = model.predict(X_test)
print(classification_report(Y_test, pred_test))

              precision    recall  f1-score   support

           A       0.00      0.00      0.00         0
           C       0.58      0.64      0.61        39
           D       0.29      0.20      0.24        20
           E       0.14      0.11      0.12         9
           F       0.67      0.67      0.67         3
           G       0.43      0.75      0.55         4

    accuracy                           0.47        75
   macro avg       0.35      0.39      0.36        75
weighted avg       0.45      0.47      0.45        75

/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

	AcresBurned	Date	Counties	Latitude	Longitude
0	257314.0	2013-08-17	Tuolumne	37.857000	-120.086000
1	30274.0	2013-05-30	Los Angeles	34.585595	-118.423176
2	27531.0	2013-07-15	Riverside	33.709500	-116.728850
3	27440.0	2013-08-10	Placer	39.120000	-120.650000
4	22992.0	2013-07-22	Fresno	37.279000	-119.318000

	USAF	WBAN	STATION NAME	CTRY	STATE	ICAO	LAT	LON	ELEV(M)	BEGIN	END	ID
0	690020	93218	JOLON HUNTER LIGGETT MIL RES	US	CA	KHGT	36.000	-121.233	317.0	19640715	19970401	69002093218
1	690020	99999	JOLON HUNTER LIGGETT MIL RES	US	CA	KHGT	36.000	-121.233	317.0	20030702	20030801	69002099999
2	690070	93217	FRITZSCHE AAF	US	CA	KOAR	36.683	-121.767	43.0	19600404	19930831	69007093217
3	690140	93101	EL TORO MCAS	US	CA	KNZJ	33.667	-117.733	116.7	19890101	19990630	69014093101
4	690150	93121	TWENTY NINE PALMS	US	CA	KNXP	34.300	-116.167	625.1	19900102	20210312	69015093121

	AcresBurned	Date	Counties	Latitude	Longitude	Station_ID	Join_ID
0	30274.0	2013-05-30	Los Angeles	34.585595	-118.423176	99999923180	1
1	27440.0	2013-08-10	Placer	39.120000	-120.650000	72584523225	2
2	20292.0	2013-08-07	Riverside	33.861570	-116.904270	72068799999	3
3	8073.0	2013-09-09	Shasta	40.498332	-122.535496	9999994222	4
4	7055.0	2013-07-06	San Diego	32.954350	-116.473810	99999923178	5

	STATION	DATE	LATITUDE	LONGITUDE	ELEVATION	NAME	TEMP	TEMP_ATTRIBUTES	DEWP	DEWP_ATTRIBUTES	...	MXSPD	GUST	MAX	MIN	PRCP_ATTRIBUTES	SNDP
0	40854099999	2013-01-01	29.083925	58.450397	940.0	BAM, IR	44.1	8	23.9	8	...	7.8	999.9	56.1	36.3	G	999.9
1	40854099999	2013-01-02	29.083925	58.450397	940.0	BAM, IR	43.4	10	24.7	10	...	12.0	999.9	56.1	33.8	G	999.9
2	40854099999	2013-01-03	29.083925	58.450397	940.0	BAM, IR	43.9	7	21.5	7	...	7.8	999.9	56.1	36.3	G	999.9
3	40854099999	2013-01-04	29.083925	58.450397	940.0	BAM, IR	44.4	8	23.3	8	...	5.8	999.9	57.2	36.3	G	999.9
4	40854099999	2013-01-05	29.083925	58.450397	940.0	BAM, IR	45.2	10	24.5	10	...	12.0	999.9	57.2	35.6	G	999.9

	STATION	DATE	LATITUDE	LONGITUDE	ELEVATION	NAME	TEMP	TEMP_ATTRIBUTES	DEWP	DEWP_ATTRIBUTES	...	GUST	MAX	MIN	PRCP_ATTRIBUTES	SNDP	Join_ID
0	99999953150	2013-06-16	37.7592	-119.82080	2017.8	YOSEMITE VILLAGE 12 W, CA US	59.3	24	9999.9	0	...	999.9	67.8	52.9	G	999.9	10
1	72383023187	2013-07-19	34.7436	-118.72420	1374.6	SANDBERG, CA US	80.5	24	33.8	24	...	27.0	96.1	66.0	G	999.9	12
2	99999993243	2013-05-03	37.2381	-120.88250	23.8	MERCED 23 WSW, CA US	71.3	24	9999.9	0	...	999.9	93.6	46.2	G	999.9	18
3	74504893210	2013-10-04	39.4900	-121.61833	57.9	OROVILLE MUNICIPAL AIRPORT, CA US	66.2	24	27.4	24	...	27.0	82.9	51.1	G	999.9	19
4	74504893210	2013-06-03	39.4900	-121.61833	57.9	OROVILLE MUNICIPAL AIRPORT, CA US	80.8	24	51.1	24	...	999.9	96.1	61.0	G	999.9	20

	AcresBurned	Date	County	Temp	WindSpd	RelaHumPct
1	712.0	2013-07-19	Kern	26.944444	13.2	18.527509
3	305.0	2013-10-04	Butte	19.000000	10.1	23.086979
4	298.0	2013-06-03	Butte	27.111111	3.5	35.698961
5	240.0	2013-06-08	Alameda	28.666667	4.0	35.230183
6	200.0	2013-07-04	Tehama	36.500000	10.0	19.131822

	AcresBurned	Temp	WindSpd	Precipitation	RelaHumPct
count	372.000000	372.000000	372.000000	372.000000	372.000000
mean	6147.862903	22.626045	6.004301	0.005376	41.945580
std	42796.625726	5.260780	2.880510	0.074306	17.166863
min	0.000000	2.611111	0.300000	0.000000	6.578408
25%	33.000000	19.041667	4.300000	0.000000	29.354164
50%	78.000000	22.972222	5.400000	0.000000	38.853739
75%	241.750000	26.236111	6.900000	0.000000	53.300922
max	410203.000000	41.222222	22.000000	1.360000	91.355476

	AcresBurned	Date	County	Temp	WindSpd	RelaHumPct
221	410203.0	2018-07-27	Colusa	27.611111	4.5	27.269169
222	410203.0	2018-07-27	Glenn	27.611111	4.5	27.269169
223	410203.0	2018-07-27	Lake	27.611111	4.5	27.269169
224	410203.0	2018-07-27	Mendocino	27.611111	4.5	27.269169

	Date	County	Temp	WindSpd	Precipitation	RelaHumPct	AcresBurned_log10
101	2016-04-19	Los Angeles	20.111111	9.6	0.00	20.252206	-1.0
219	2017-05-19	San Luis Obispo	20.000000	6.1	0.00	32.720699	-1.0
220	2017-02-07	Butte	13.722222	19.7	1.36	88.698477	-1.0

Analysis and Prediction on California Wildfire and Climate Variables¶

Overview¶

Names¶

Research Question¶

Background & Prior Work¶

Hypothesis¶

Dataset(s)¶

Setup¶

Import Necessary Libraries¶

Import Data Sets¶

Data Cleaning¶

First, we clean the California wildfire incidents data set¶

Second, clean the Integrated Surface Data (ISD) station list¶

Thrid, we merge thewildfire and station dataframes by matching the LATITUDE and LONGTITUDE of the wildfire incident locations and weather stations.¶

Fourth, using the matched_wildfire data frame merge with the weather dataset from 2013 to 2019 so that every wildfire incident has the weather data of that day.¶

Lastly, import and clean this final data frame¶

The Final Dataframe¶

Data Analysis & Results¶

Descriptive Analysis¶

EDA: Exploratory Data Analysis¶

If we Google this date, we see that Mendocino complex fire affects four counties. It makes sense that the readership increased by 410203 there.¶

Inferential Analysis¶

Relationship between variables¶

Acres burned V.S. Month¶

Acres burned V.S. Temperature¶

Linear Regression with single climate feature¶

Acres burned V.S. Temperature¶

Acres burned V.S. Wind Speed¶

Acres burned V.S. Relative Humidity Percentage¶

Linear Regression with combined climate features¶

Acres burned V.S. Temperature, Wind Speed¶

Acres burned V.S. Relative Humidity Percentage, Temperature¶

Acres burned V.S. Relative Humidity Percentage, Wind Speed¶

Acres burned V.S. Relative Humidity Percentage, Temperature, Wind Speed¶

Results: Model Comparison and Interpretation¶

OLS models with single variable¶

Single variable vs. Multiple variables¶

OLS model with Relative Humidity Percent¶

A Further Step: Prediction¶

Results: Prediction summary¶

Model Assessment¶

Interpretation¶

Ethics & Privacy¶

Conclusion & Discussion¶

Team Contributions¶

Thrid, we merge the`wildfire` and `station` dataframes by matching the LATITUDE and LONGTITUDE of the wildfire incident locations and weather stations.¶

Fourth, using the `matched_wildfire` data frame merge with the weather dataset from 2013 to 2019 so that every wildfire incident has the weather data of that day.¶