In [5]:
#This script opens AQ PM2.5 data from the AirNow network
#and filters and formats the data accordingly

In [6]:
#Important libraries and settings.
#NOTE: no pandas option is actually changed in this cell, so the chained-assignment
#warning (SettingWithCopyWarning) stays enabled; only the imports below are run.
import pandas as pd
import numpy as np


In [7]:
#Read in AQ data as a pandas data frame, and clean it up a bit as we read it in.
output_file = "aq_obs_88101_08_2020.pkl"
filepath = '/uufs/chpc.utah.edu/common/home/u0703457/lin-group7/dvm/projects/UDAQ_2020-22/obs/AirNow/'
filename = 'hourly_88101_PM25_2020.csv'
filename = filepath+filename

#Only these columns are loaded. The date and time columns are forced to plain
#strings so they can be joined and parsed explicitly below.
aq_dat = pd.read_csv(
    filename,
    sep=",",
    usecols=['State Code','County Code','Site Num','Latitude','Longitude','Date GMT','Time GMT','Sample Measurement'],
    dtype={'Date GMT': str, 'Time GMT': str},
)

#Build one timestamp column from the date and time columns. Combining columns through
#read_csv(parse_dates=[[...]]) is deprecated in pandas >= 2.2, so do it with pd.to_datetime.
obs_time = pd.to_datetime(aq_dat['Date GMT'] + ' ' + aq_dat['Time GMT'])
aq_dat = aq_dat.drop(columns=['Date GMT', 'Time GMT'])
#Insert as the first column so the column layout is the same as with the old parse_dates combine.
aq_dat.insert(0, 'Time', obs_time)

aq_dat = aq_dat.rename(columns={'Latitude': 'lat','Longitude':'lon','Sample Measurement':'pm25'})
print('Size of data frame = '+str(np.shape(aq_dat)))

Size of data frame = (5732898, 7)


In [8]:
#Subset the data to the region and time slot that we are most interested in, tossing everything else:
#  longitude in [-112.5, -111.5), latitude in [40, 41), and
#  times between 2020-07-22 and 2020-07-23 (between() keeps both end points).
in_lon_range = (aq_dat['lon'] >= -112.5) & (aq_dat['lon'] < -111.5)
in_lat_range = (aq_dat['lat'] >= 40) & (aq_dat['lat'] < 41)
in_time_slot = aq_dat['Time'].between('2020-07-22', '2020-07-23')

#Take an explicit copy so that later in-place edits of aq_sub work on an independent
#frame instead of a slice of aq_dat (assigning into a slice raises SettingWithCopyWarning).
aq_sub = aq_dat[in_lon_range & in_lat_range & in_time_slot].copy()

In [9]:
#Report the (rows, columns) shape of the subset.
print(f'Size of data frame = {aq_sub.shape}')

Size of data frame = (248, 7)


In [11]:
#Summarise the pm25 column of the subset: largest, smallest and average value.
pm25_values = aq_sub['pm25']
pm25_max = pm25_values.max()
pm25_min = pm25_values.min()
pm25_mean = pm25_values.mean()
print(f'The max PM2.5 measurement is :{pm25_max}')
print(f'The minumum PM2.5 measurement is :{pm25_min}')
print(f'The mean PM2.5 measurement is :{pm25_mean}')

The max PM2.5 measurement is :24.9
The minumum PM2.5 measurement is :2.8
The mean PM2.5 measurement is :9.88266129032258


In [7]:
#Weird that we have negative concentrations, two ways we can deal with this...
#Option 1: replace only the negative pm25 values with NaN. Selecting the 'pm25' column
#through .loc leaves the other columns of those rows (Time, lat, lon, site codes) intact;
#assigning through a bare boolean mask would overwrite every column of those rows with NaN.
aq_sub.loc[aq_sub['pm25'] < 0, 'pm25'] = np.nan

In [8]:
#Re-check the summary statistics of pm25 after handling the negative values.
#The third line reports the mean, so it is labelled as the mean.
print('The max PM2.5 measurement is :'+str(np.max(aq_sub['pm25'])))
print('The minumum PM2.5 measurement is :'+str(np.min(aq_sub['pm25'])))
print('The mean PM2.5 measurement is :'+str(np.mean(aq_sub['pm25'])))

The max PM2.5 measurement is :146.6
The minumum PM2.5 measurement is :0.0
The minumum PM2.5 measurement is :16.958832498366377


In [9]:
#or do this...
#Option 2: drop the rows with negative concentrations instead of masking them.
#Zero readings are kept (>= 0), consistent with the masking approach which only
#touches values below zero. Rows whose pm25 is NaN are dropped as well, because
#a comparison against NaN is False.
aq_sub = aq_sub[aq_sub['pm25'] >= 0]

In [10]:
#Re-check the summary statistics of pm25 after dropping rows.
#The third line reports the mean, so it is labelled as the mean.
print('The max PM2.5 measurement is :'+str(np.max(aq_sub['pm25'])))
print('The minumum PM2.5 measurement is :'+str(np.min(aq_sub['pm25'])))
print('The mean PM2.5 measurement is :'+str(np.mean(aq_sub['pm25'])))

The max PM2.5 measurement is :146.6
The minumum PM2.5 measurement is :0.0
The minumum PM2.5 measurement is :16.958832498366377


In [11]:
import matplotlib.pyplot as plt

#Histogram of the pm25 values using 20 bins.
#NaN values are dropped first: the bin range cannot be determined when NaNs are present.
fig, ax = plt.subplots()
ax.hist(aq_sub['pm25'].dropna(), bins=20)
#Label the figure so it can be read on its own.
ax.set_xlabel('PM2.5 measurement')
ax.set_ylabel('Number of observations')
ax.set_title('Distribution of PM2.5 measurements')
plt.show()

<Figure size 640x480 with 1 Axes>

In [13]:
#Show the subset as text; pandas abbreviates long frames with "..." rows when printing.
aq_sub_text = str(aq_sub)
print(aq_sub_text)

                       Time  State Code  County Code  Site Num        lat  \
5061417 2020-07-22 00:00:00          49           11         4  40.902967   
5061418 2020-07-22 01:00:00          49           11         4  40.902967   
5061419 2020-07-22 02:00:00          49           11         4  40.902967   
5061420 2020-07-22 03:00:00          49           11         4  40.902967   
5061421 2020-07-22 04:00:00          49           11         4  40.902967   
...                     ...         ...          ...       ...        ...   
5187881 2020-07-22 20:00:00          49           49      5010  40.136336   
5187882 2020-07-22 21:00:00          49           49      5010  40.136336   
5187883 2020-07-22 22:00:00          49           49      5010  40.136336   
5187884 2020-07-22 23:00:00          49           49      5010  40.136336   
5187885 2020-07-23 00:00:00          49           49      5010  40.136336   

                lon  pm25  
5061417 -111.884467  10.4  
5061418 -111.884467