{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Data Setup\n",
    "========="
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook nicely aggregates data for analysis. This includes EM27 oof and side by side corrections, averaging kernels, and met data. For the Meyer 2025 paper (https://doi.org/10.1029/2025JD044398), this notebook creates the datasets available at https://doi.org/10.1029/2025JD044398. These datasets are subsequently used in the EM27_ratios.ipynb and Inventory.ipynb demonstrations. I create the datasets for the two EM27s I use in SLV, but the paper only uses the 'ha' instrument. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Import Packages\n",
    "import sys\n",
    "import pickle\n",
    "import os\n",
    "import pytz\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.ticker\n",
    "import matplotlib.dates as mdates\n",
    "import matplotlib.gridspec as gridspec\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "import xarray as xr\n",
    "import warnings\n",
    "import json\n",
    "\n",
    "sys.path.append('/uufs/chpc.utah.edu/common/home/u0890904/LAIR_1/')\n",
    "from atmos.utils import met_utils, datetime_utils, plot_utils, regression_utils, df_utils, ac_utils\n",
    "\n",
    "pd.options.mode.chained_assignment = None\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Define some global variable like path, timezone, etc.\n",
    "project_path = '/uufs/chpc.utah.edu/common/home/u0890904/public_html/Projects/SLV_EM27_2025'\n",
    "project_data_path = os.path.join(project_path,'Data')\n",
    "timezone = 'US/Mountain'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# EM27 Data Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Define a plotting function\n",
    "def plot_side_by_side(merged_oof_df, corr_gases, true_inst, corr_inst, regressions, timezone, title_id,summary_keys=None):\n",
    "    dates = sorted(list(set(merged_oof_df.index.date)))\n",
    "    broken_dtranges = []\n",
    "    for date in dates:\n",
    "        dtstart_str = f'{date} 08:00:00'\n",
    "        dtend_str = f'{date} 21:00:00'\n",
    "        broken_dtranges.append([pd.to_datetime(dtstart_str).tz_localize(timezone), pd.to_datetime(dtend_str).tz_localize(timezone)])\n",
    "    # Find the largest difference between consecutive days\n",
    "    date_diffs = [(dates[i+1] - dates[i]).days for i in range(len(dates) - 1)]\n",
    "    max_diff_index = date_diffs.index(max(date_diffs)) + 1  # +1 to get the index of the day after the largest gap\n",
    "\n",
    "    fig = plt.figure(figsize=(20, 10))\n",
    "    gs = gridspec.GridSpec(len(corr_gases), 8, width_ratios=[.8, .8, .8, .8, .8, .8, .2, 1])\n",
    "    true_inst_color = 'tomato'\n",
    "    corr_inst_color = 'deepskyblue'\n",
    "    em27_marker_size = 3\n",
    "    gridalpha = 0.2\n",
    "    labsize = 12\n",
    "    d = .01  # how big to make the diagonal lines in axes coordinates\n",
    "\n",
    "    for row in range(len(corr_gases)):\n",
    "        gas = corr_gases[row]\n",
    "        for col in range(len(broken_dtranges)):\n",
    "            ax = plt.subplot(gs[row, col])\n",
    "            ax.scatter(merged_oof_df.index, merged_oof_df[f'{gas}_{true_inst}'], color=true_inst_color, s=em27_marker_size)\n",
    "            ax.scatter(merged_oof_df.index, merged_oof_df[f'{gas}_{corr_inst}'], color=corr_inst_color, s=em27_marker_size)\n",
    "            if col == 0:\n",
    "                alpha = 0.8\n",
    "            else:\n",
    "                alpha = 1\n",
    "            ax.set_xlim(broken_dtranges[col][0], broken_dtranges[col][1])\n",
    "            if col < len(broken_dtranges) - 1:\n",
    "                if col != max_diff_index - 1:\n",
    "                    ax.spines['right'].set_visible(False)\n",
    "                ax.yaxis.tick_left()\n",
    "            if col > 0:\n",
    "                ax.yaxis.set_visible(False)\n",
    "            if col > 0:\n",
    "                if col != max_diff_index:\n",
    "                    ax.spines['left'].set_visible(False)\n",
    "                ax.yaxis.tick_right()\n",
    "            if col < len(broken_dtranges) - 1:\n",
    "                kwargs = dict(transform=ax.transAxes, color='k', clip_on=False)\n",
    "                ax.plot((1 - d, 1 + d), (-d, +d), **kwargs)  # top-left diagonal\n",
    "                ax.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)  # bottom-left diagonal\n",
    "            if col > 0:\n",
    "                kwargs.update(transform=ax.transAxes)  # switch to the bottom axes\n",
    "                ax.plot((-d, d), (-d, +d), **kwargs)  # top-right diagonal\n",
    "                ax.plot((-d, d), (1 - d, 1 + d), **kwargs)  # bottom-right diagonal\n",
    "            ax.get_xaxis().set_ticklabels([])\n",
    "            ax.xaxis.set_major_locator(mdates.HourLocator(interval=2))\n",
    "            ax.set_ylabel(gas, size=labsize)\n",
    "            ax.tick_params(labelsize=labsize - 2)\n",
    "            if (row == 0) & (col == 5):\n",
    "                ax.scatter([], [], color=true_inst_color, s=30, label=f'EM27_{true_inst}')\n",
    "                ax.scatter([], [], color=corr_inst_color, s=30, label=f'EM27_{corr_inst}')\n",
    "                ax.legend(fontsize=labsize - 1)\n",
    "            if row == 2:\n",
    "                ax.tick_params(labelsize=labsize - 2)\n",
    "                ax.xaxis.set_major_locator(mdates.HourLocator(interval=2))\n",
    "                ax.xaxis.set_major_formatter(mdates.DateFormatter('%H', tz=pytz.timezone(timezone)))\n",
    "                ax.set_xlabel(f\"{broken_dtranges[col][0].strftime('%Y-%m-%d')} {merged_oof_df.index[0].strftime('%Z')}\", size=labsize)\n",
    "        ax = plt.subplot(gs[row, len(broken_dtranges)])\n",
    "        ax.set_visible(False)\n",
    "        ax = plt.subplot(gs[row, len(broken_dtranges) + 1])\n",
    "        x = f'{gas}_{corr_inst}'\n",
    "        y = f'{gas}_{true_inst}'\n",
    "        ax.scatter(merged_oof_df[x], merged_oof_df[y], color='k', s=2)\n",
    "        #plot_utils.plot_reg_on_ax(ax, regressions[gas], labsize=labsize, color='grey')\n",
    "        \n",
    "        plot_utils.add_single_regression_to_ax(ax,merged_oof_df,x,y,regressions[gas],summary_keys=summary_keys)\n",
    "        ax.set_xlabel(x, size=labsize)\n",
    "        ax.set_ylabel(y, size=labsize)\n",
    "        ax.tick_params(labelsize=labsize - 2)\n",
    "    plt.subplots_adjust(hspace=0.3, wspace=0.1)\n",
    "    fig.suptitle(f'Side By Side ({title_id}): {true_inst} , {corr_inst}', y=.93)\n",
    "    fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Define the oof paths\n",
    "em27_data_path = '/uufs/chpc.utah.edu/common/home/lin-group24/agm/EM27' #This holds the entire EM27 dataset and GGG/EGI retrieval results.\n",
    "project_oof_path = os.path.join(project_path,'Data','EM27_oof') #This holds just the oof files for convenience. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Define the site locations and instrument details (when and where they were deployed)\n",
    "site_locs = {'WBB':{'lat':40.768,'lon':-111.854,'masl':1464.9},\n",
    "             'DBK':{'lat':40.538,'lon':-112.070,'masl':1584},\n",
    "             'UUSYR':{'lat':41.089,'lon':-112.119,'masl':1286}}\n",
    "\n",
    "inst_details = {\n",
    "    'ha':[{'site':'WBB', 'dtr': datetime_utils.DateTimeRange('2022-05-01 00:00:00', '2025-02-01 00:00:00', timezone)}],\n",
    "    'ua':[{'site':'WBB', 'dtr': datetime_utils.DateTimeRange('2023-07-08 11:00:00', '2023-07-11 23:59:59', timezone)},\n",
    "           {'site':'DBK', 'dtr': datetime_utils.DateTimeRange('2023-07-12 00:00:00', '2023-08-11 23:59:59', timezone)},\n",
    "           {'site':'WBB', 'dtr': datetime_utils.DateTimeRange('2023-08-12 00:00:00', '2023-08-14 23:59:59', timezone)},\n",
    "           {'site':'WBB', 'dtr': datetime_utils.DateTimeRange('2024-07-20 00:00:00', '2024-07-25 23:59:59', timezone)},\n",
    "           {'site':'UUSYR', 'dtr': datetime_utils.DateTimeRange('2024-07-23 00:00:00', '2024-08-31 23:59:59', timezone)},\n",
    "           {'site':'WBB', 'dtr': datetime_utils.DateTimeRange('2024-09-01 00:00:00', '2024-09-07 23:59:59', timezone)},\n",
    "           ],\n",
    "}\n",
    "\n",
    "inst_ids = list(inst_details.keys())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Copy all of the oof files to a single folder for each instrument\n",
    "for inst_id in inst_ids:\n",
    "    inst_oof_cp_path = os.path.join(project_oof_path,inst_id)\n",
    "    if not os.path.exists(inst_oof_cp_path):\n",
    "        os.makedirs(inst_oof_cp_path)\n",
    "    if len(os.listdir(inst_oof_cp_path)) > 0:\n",
    "        raise ValueError('Path already exists, you may end up overwriting data')\n",
    "    inst_oof_og_path = os.path.join(em27_data_path,inst_id,'results')\n",
    "    ac_utils.copy_em27_oofs_to_singlefolder(inst_oof_og_path,inst_oof_cp_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Load all of the EM27 data defined in the instrument details\n",
    "cols_to_load = ['flag','solzen(deg)','xco(ppb)','xco(ppb)_error','xch4(ppm)','xch4(ppm)_error','xco2(ppm)','xco2(ppm)_error'] #pare down the dfs to just these cols\n",
    "cols_to_keep = ['solzen(deg)','site','xco(ppm)','xco(ppm)_error','xch4(ppm)','xch4(ppm)_error','xco2(ppm)','xco2(ppm)_error'] #cols to keep in the final df\n",
    "good_flags = [0,23] #these are the flags that are considered \"good\" here. 0 is the default \"good\" flag, 23 is the flag when nas are in the WSPD column (which we get elsewhere)\n",
    "oof_dfs = {} #store the oof dfs in a dictionary by instrument\n",
    "for inst_id in inst_ids: \n",
    "    oof_df = pd.DataFrame()\n",
    "    inst_oof_path = os.path.join(project_oof_path,inst_id)\n",
    "    my_oof_manager = ac_utils.oof_manager(inst_oof_path,timezone)\n",
    "    inst_det_list = inst_details[inst_id]\n",
    "    for inst_det in inst_det_list:\n",
    "        site = inst_det['site']\n",
    "        dtr = inst_det['dtr']\n",
    "        dt1 = dtr.start_dt\n",
    "        dt2 = dtr.end_dt\n",
    "        new_oof_df = my_oof_manager.load_oof_df_inrange(dt1,dt2,filter_flag_0 = False,cols_to_load = cols_to_load,print_out = True) #load the oof data in the specified range\n",
    "        new_oof_df = new_oof_df[new_oof_df['flag'].isin(good_flags)] #filter by the good flags\n",
    "        for col in new_oof_df.columns: #convert the ppb columns to ppm\n",
    "            if 'ppb' in col:\n",
    "                new_col_name = col.replace('ppb','ppm')\n",
    "                new_oof_df[new_col_name] = new_oof_df[col]/1000 \n",
    "        new_oof_df['site'] = site\n",
    "        new_oof_df = new_oof_df[cols_to_keep] #pare down the df to just the columns we want\n",
    "        oof_df = pd.concat([oof_df,new_oof_df]) #concatenate the new df to the old one\n",
    "    oof_dfs[inst_id] = oof_df #store the df in the dictionary"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Side By Side Correction "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First find the corrections for each set of side-by-sides using regressions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Find the correction factors \n",
    "true_inst = 'ha'\n",
    "corr_inst = 'ua'\n",
    "sbs_loc = 'WBB'\n",
    "resample_interval = '5min'\n",
    "corr_gases = ['xco(ppm)','xch4(ppm)','xco2(ppm)']\n",
    "plot = True\n",
    "\n",
    "sbs_total_dtranges = [datetime_utils.DateTimeRange('2023-07-08 11:00:00', '2023-08-14 23:59:59', timezone),\n",
    "                      datetime_utils.DateTimeRange('2024-07-20 00:00:00', '2024-09-07 23:59:59', timezone)]\n",
    "sbs_details = []\n",
    "for sbs_total_dtrange in sbs_total_dtranges:\n",
    "    sbs_dt_ranges = ac_utils.find_sbs_ranges_inrange(inst_details,true_inst,corr_inst,sbs_total_dtrange)[sbs_loc]\n",
    "    sbs_oof_dfs = {}\n",
    "    for inst_id in [true_inst,corr_inst]:\n",
    "        sbs_oof_df = pd.DataFrame()\n",
    "        for sbs_dtr in sbs_dt_ranges:\n",
    "            new_oof_df = oof_dfs[inst_id].loc[(oof_dfs[inst_id].index >= sbs_dtr.start_dt) & (oof_dfs[inst_id].index <= sbs_dtr.end_dt)]\n",
    "            sbs_oof_df = pd.concat([sbs_oof_df,new_oof_df])\n",
    "        sbs_oof_dfs[inst_id] = sbs_oof_df.resample(resample_interval).mean(numeric_only=True)\n",
    "    merged_oof_df = ac_utils.merge_oofdfs(sbs_oof_dfs,dropna=True)\n",
    "    corr_regressions = {}\n",
    "    for corr_gas in corr_gases:\n",
    "        #corr_regressions[corr_gas] = ac.lin_regress_2(merged_oof_df,f'{corr_gas}_{corr_inst}',f'{corr_gas}_{true_inst}')\n",
    "        corr_regressions[corr_gas] = regression_utils.ols_regression(merged_oof_df,f'{corr_gas}_{corr_inst}',f'{corr_gas}_{true_inst}')\n",
    "    if plot:\n",
    "        fig = plot_side_by_side(merged_oof_df, corr_gases, true_inst, corr_inst, corr_regressions, timezone, 'No Correction',summary_keys=['slope','r_squared'])\n",
    "    \n",
    "    corr_merged_oof_df = merged_oof_df.copy()\n",
    "    for corr_gas in corr_gases:\n",
    "        corr_merged_oof_df[f'{corr_gas}_{corr_inst}'] = corr_merged_oof_df.apply(\n",
    "            lambda row: row[f'{corr_gas}_{corr_inst}']*corr_regressions[corr_gas]['slope'] + corr_regressions[corr_gas]['intercept']\n",
    "            ,axis = 1)\n",
    "        \n",
    "    post_corr_regressions = {}\n",
    "    for corr_gas in corr_gases:\n",
    "        post_corr_regressions[corr_gas] = regression_utils.ols_regression(corr_merged_oof_df,f'{corr_gas}_{corr_inst}',f'{corr_gas}_{true_inst}')\n",
    "        #ac.lin_regress_2(corr_merged_oof_df,f'{corr_gas}_{true_inst}',f'{corr_gas}_{corr_inst}')\n",
    "\n",
    "    if plot:\n",
    "        fig = plot_side_by_side(corr_merged_oof_df, corr_gases, true_inst, corr_inst, post_corr_regressions, timezone, 'With Correction',summary_keys=['slope','r_squared'])\n",
    "    \n",
    "    sbs_details.append({'sbs_total_dtrange':sbs_total_dtrange,\n",
    "                        'corr_regressions':corr_regressions})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now apply the corrections to the data based on the applicable datetime ranges and the regression coefficients"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Apply the corrections to the entire dataset\n",
    "corr_oof_dfs = {}\n",
    "corr_oof_dfs[true_inst] = oof_dfs[true_inst].copy()\n",
    "oof_df = oof_dfs[corr_inst].copy()\n",
    "corr_oof_df = pd.DataFrame()\n",
    "for sbs_detail in sbs_details:\n",
    "    dtr = sbs_detail['sbs_total_dtrange']\n",
    "    corr_regressions = sbs_detail['corr_regressions']\n",
    "    sub_oof_df = oof_df.loc[(oof_df.index >= dtr.start_dt) & (oof_df.index <= dtr.end_dt)]\n",
    "    for corr_gas in corr_gases:\n",
    "        sub_oof_df[corr_gas] = sub_oof_df[corr_gas] * corr_regressions[corr_gas]['slope'] + corr_regressions[corr_gas]['intercept']\n",
    "    corr_oof_df = pd.concat([corr_oof_df,sub_oof_df])\n",
    "corr_oof_dfs[corr_inst] = corr_oof_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Clip to the correct date ranges\n",
    "for inst_id, inst_det_list in inst_details.items():\n",
    "    for inst_det in inst_det_list:\n",
    "        site = inst_det['site']\n",
    "        dtr = inst_det['dtr']\n",
    "        corr_oof_dfs[inst_id].loc[(corr_oof_dfs[inst_id].index >= dtr.start_dt) & (corr_oof_dfs[inst_id].index <= dtr.end_dt), 'site'] = site"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use the cell below to do a check plot with plotly to make sure the corrections are working as expected"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Plotly plot to check that everything lines up where it should\n",
    "plot_corr_oof_dfs = {}\n",
    "for inst_id,corr_oof_df in corr_oof_dfs.items():\n",
    "    plot_corr_oof_dfs[inst_id] = corr_oof_dfs[inst_id].copy().resample(resample_interval).mean(numeric_only=True).dropna()\n",
    "fig = make_subplots(rows=3, cols=1, subplot_titles=corr_gases,shared_xaxes=True)\n",
    "for i,corr_gas in enumerate(corr_gases):\n",
    "    for inst_id,plot_corr_oof_df in plot_corr_oof_dfs.items():\n",
    "        if inst_id == true_inst:\n",
    "            color = 'tomato'\n",
    "        else:\n",
    "            color = 'deepskyblue'\n",
    "        fig.add_trace(go.Scatter(\n",
    "            x=plot_corr_oof_df.index, \n",
    "            y=plot_corr_oof_df[corr_gas], \n",
    "            mode='markers', \n",
    "            marker_color=color,\n",
    "            name=f'{inst_id}_{corr_gas}'), \n",
    "            row=i+1, col=1)\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write to Parquet File"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Concat the dfs:\n",
    "corr_oof_df_concat = df_utils.concat_dict_of_dfs(corr_oof_dfs,drop_index=False)\n",
    "\n",
    "# Write to Parquet File\n",
    "corr_oof_df_concat.to_parquet(os.path.join(project_data_path,'corr_oof_df_concat.parquet'),index = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Averaging Kernels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This pulls out the surface averaging kernels from the netcdf files created by the retrieval. \n",
    "# This can take a while because it has to open every netcdf file and I never streamlined the code \n",
    "\n",
    "xgas_names = ['xco','xch4','xco2']\n",
    "surface_ak_dfs = {}\n",
    "for inst_id in inst_details.keys():\n",
    "    print(inst_id)\n",
    "    surface_ak_df = pd.DataFrame()\n",
    "    daily_path = os.path.join(em27_data_path,inst_id,'results','daily')\n",
    "    for datestr in sorted(os.listdir(daily_path)):\n",
    "        date_path = os.path.join(daily_path,datestr)\n",
    "        print(datestr)\n",
    "        try:\n",
    "            nc_fname = [file for file in os.listdir(date_path) if file.endswith('.nc')][0]\n",
    "        except:\n",
    "            continue\n",
    "        ds = xr.open_dataset(os.path.join(date_path,nc_fname))\n",
    "        surface_ak_ds = xr.Dataset()\n",
    "        for xgas_name in xgas_names:\n",
    "            surface_ak_interp = ac_utils.get_surface_ak(ds,xgas_name)\n",
    "            surface_ak_ds[f'{xgas_name}_surf_ak'] = surface_ak_interp\n",
    "        df = surface_ak_ds.to_dataframe()\n",
    "        surface_ak_df = pd.concat([surface_ak_df,df])\n",
    "    surface_ak_df = surface_ak_df.rename_axis('dt')\n",
    "    surface_ak_df.index = surface_ak_df.index.tz_localize('UTC').tz_convert(timezone)\n",
    "    surface_ak_dfs[inst_id] = surface_ak_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write to parquet file\n",
    "surface_ak_df_concat = df_utils.concat_dict_of_dfs(surface_ak_dfs,drop_index=False)\n",
    "surface_ak_df_concat.to_parquet(os.path.join(project_data_path,'surface_ak_df_concat.parquet'),index = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Met Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load meteorological data directly from the met files in the EM27 data path. \n",
    "\n",
    "met_type = 'ggg'\n",
    "mh = met_utils.MetHandler()\n",
    "met_dfs = {}\n",
    "\n",
    "for inst_id, details in inst_details.items():\n",
    "    for detail in details:\n",
    "        site = detail['site']\n",
    "        if (inst_id == 'ua') & (site == 'WBB') | (site == 'DBK'):\n",
    "            continue\n",
    "        print(f'Loading data for {inst_id} at {site}')\n",
    "        dtr = detail['dtr']\n",
    "        met_data_path = os.path.join(em27_data_path, inst_id, 'inst_data/met', site)\n",
    "        \n",
    "        #Load the data for the given datetime range\n",
    "        met_df = mh.load_stddata_in_range(met_type, met_data_path, dtr)\n",
    "        \n",
    "        met_df[['u','v']] = met_df.apply(lambda row: met_utils.wswd_to_uv(row['ws'],row['wd']),axis = 1, result_type = 'expand')\n",
    "\n",
    "        #Concatenate the dataframes for each site\n",
    "        if site not in met_dfs:\n",
    "            met_dfs[site] = met_df\n",
    "        else:\n",
    "            met_dfs[site] = pd.concat([met_dfs[site], met_df])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write met data to parquet files\n",
    "met_dfs_concat = df_utils.concat_dict_of_dfs(met_dfs,drop_index=False)\n",
    "met_dfs_concat.to_parquet(os.path.join(project_data_path,'met_dfs_concat.parquet'),index = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HRRR Smoke"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Prior to running this notebook, the \"hrrr_smoke.py\" main script should be run to download and format the HRRR smoke data. This script will download the data for the specified date range and save it to a file named hrrr_smoke.parquet in the data folder. It can be run using the code in slurm/jobs/hrrr_smoke.slurm. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "smoke_df_fname = 'hrrr_smoke.parquet'\n",
    "smoke_df = pd.read_parquet(os.path.join(project_data_path, smoke_df_fname))\n",
    "\n",
    "\n",
    "# Calculate daily mean and max\n",
    "smoke_day_df = smoke_df.groupby(smoke_df.index.date).agg({\n",
    "    'tc_mdens': ['mean', 'max'],\n",
    "    'mdens': ['mean', 'max'],\n",
    "})\n",
    "# Flatten the MultiIndex columns\n",
    "smoke_day_df.columns = ['tc_mdens_mean', 'tc_mdens_max', 'mdens_mean', 'mdens_max']\n",
    "\n",
    "# Calculate statistics for mean values\n",
    "mean_tc_mdens = smoke_day_df['tc_mdens_mean'].mean()\n",
    "median_tc_mdens = smoke_day_df['tc_mdens_mean'].median()\n",
    "std_tc_mdens = smoke_day_df['tc_mdens_mean'].std()\n",
    "max_tc_mdens = smoke_day_df['tc_mdens_mean'].max()\n",
    "\n",
    "# Filter outliers based on mean values\n",
    "outliers_mean = smoke_day_df[smoke_day_df['tc_mdens_mean'] > (median_tc_mdens + std_tc_mdens)]\n",
    "outliers_max = smoke_day_df[smoke_day_df['tc_mdens_max'] > (median_tc_mdens + std_tc_mdens)]\n",
    "\n",
    "outlier_dates_mean = outliers_mean.index\n",
    "outlier_dates_max = outliers_max.index\n",
    "\n",
    "outliers = {\n",
    "    \"mean\": outlier_dates_mean.astype(str).tolist(),\n",
    "    \"max\": outlier_dates_max.astype(str).tolist(),\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save to json file\n",
    "with open(os.path.join(project_data_path,'smoke_outliers.json'), 'w') as handle:\n",
    "    json.dump(outliers, handle, indent=4, sort_keys=True, default=str)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "atmos1",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}