import xarray as xr
from netCDF4 import Dataset
import h5py
import numpy as np
from multiprocessing import Pool
import s3fs
import time
import pandas as pd
import json
import os
import sys
import time
import subprocess as sp
import os
# Connect to AWS S3 storage with anon=True (anonymous access, no credentials supplied)
fs = s3fs.S3FileSystem(anon=True)
# CMIP6 catalog CSV, downloaded once at import time. getData/queryData filter
# it with DataFrame.query and read its `zstore` column to locate the data.
df = pd.read_csv("https://cmip6-pds.s3.amazonaws.com/pangeo-cmip6.csv")
# NOTE(review): `proc` is not referenced anywhere in the visible code; presumably
# a worker count for the imported multiprocessing.Pool -- confirm or remove.
proc = 40
def getData(query:dict):
    '''
    Load AWS CMIP6 data into an xarray Dataset.

    query (dict or str) - dict or str with data information
        - if dict, format as {'param':'value','param2':['val1','val2']};
          a scalar value becomes the clause  param=='value'  and a list/tuple
          value becomes  param in ['val1', 'val2'];  clauses are AND-ed
        - if str, it is passed unchanged to pandas.DataFrame.query

    Returns the dataset opened with xarray.open_zarr (opened, not loaded), or
    None when no catalog row matches; in that case a message is printed.

    Raises TypeError if query is neither a dict nor a str.
    '''
    # Create query string for pandas.DataFrame.query
    if isinstance(query, dict):
        clauses = []
        for param, value in query.items():
            if isinstance(value, (list, tuple)):
                # Membership test so the documented list form actually works
                clauses.append("{0} in {1!r}".format(param, list(value)))
            else:
                clauses.append("{0}=='{1}'".format(param, value))
        inputStr = " & ".join(clauses)
    elif isinstance(query, str):  # if it is already a string, pass through
        inputStr = query
    else:
        raise TypeError(
            'query must be a dict or a str, got ' + type(query).__name__)

    # Search the cmip6 catalog for datasets that match the given parameters
    df_subset = df.query(inputStr)
    if df_subset.empty:
        print('data not available for '+inputStr)
        return None

    # When several catalog rows match, the last one is used. Earlier stores are
    # not opened, because their datasets would be discarded anyway.
    zstore = df_subset.zstore.values[-1]
    mapper = fs.get_mapper(zstore)
    # open_zarr, so the dataset is not loaded yet
    return xr.open_zarr(mapper, consolidated=True)
def queryData(query:dict):
    '''
    Query data availability in the CMIP6 catalog.

    query (dict or str) - dict or str with data information
        - if dict, format as {'param':'value','param2':['val1','val2']};
          a scalar value becomes the clause  param=='value'  and a list/tuple
          value becomes  param in ['val1', 'val2'];  clauses are AND-ed
        - if str, it is passed unchanged to pandas.DataFrame.query

    Returns True if at least one catalog row matches, otherwise False.

    Raises TypeError if query is neither a dict nor a str.
    '''
    # Create query string for pandas.DataFrame.query
    if isinstance(query, dict):
        clauses = []
        for param, value in query.items():
            if isinstance(value, (list, tuple)):
                # Membership test so the documented list form actually works
                clauses.append("{0} in {1!r}".format(param, list(value)))
            else:
                clauses.append("{0}=='{1}'".format(param, value))
        inputStr = " & ".join(clauses)
    elif isinstance(query, str):  # if it is already a string, pass through
        inputStr = query
    else:
        raise TypeError(
            'query must be a dict or a str, got ' + type(query).__name__)

    # Search the cmip6 catalog for datasets that match the given parameters
    return not df.query(inputStr).empty
# Models to process; each entry is used as the catalog `source_id`
modelList=['CESM2']
# Longer candidate model list, kept for reference (currently disabled):
#modelList=[ 'ACCESS-ESM1-5', 'AWI-CM-1-1-MR', 'AWI-ESM-1-1-LR', 'BCC-CSM2-MR', 'BCC-ESM1', 'CAS-ESM2-0', 'CESM2', 'CESM2-FV2', 'CESM2-WACCM', 'CESM2-WACCM-FV2', 'CMCC-CM2-HR4', 'CMCC-CM2-SR5', 'CMCC-ESM2', 'CNRM-CM6-1', 'CNRM-CM6-1-HR', 'CNRM-ESM2-1', 'CanESM5-CanOE', 'E3SM-1-0', 'E3SM-1-1', 'E3SM-1-1-ECA', 'EC-Earth3', 'EC-Earth3-AerChem', 'EC-Earth3-Veg', 'EC-Earth3-Veg-LR', 'FGOALS-f3-L', 'FGOALS-g3', 'FIO-ESM-2-0', 'GFDL-CM4', 'GFDL-ESM4','GISS-E2-1-G-CC', 'HadGEM3-GC31-LL', 'HadGEM3-GC31-MM', 'IITM-ESM', 'INM-CM4-8', 'INM-CM5-0', 'IPSL-CM6A-LR', 'KACE-1-0-G', 'KIOST-ESM', 'MIROC-ES2L', 'MIROC6', 'MPI-ESM-1-2-HAM', 'MPI-ESM1-2-HR', 'MPI-ESM1-2-LR', 'MRI-ESM2-0', 'NESM3', 'NorESM2-MM', 'SAM0-UNICON', 'TaiESM1', 'UKESM1-0-LL']
# Used as the catalog `experiment_id` and as part of the output file name
experiment='historical'
# Used as the catalog `table_id`
table='SImon' ## 'Amon'
# Ensemble members to try; each entry is used as the catalog `member_id`.
# Combinations that queryData reports as unavailable are skipped by the loop.
membList=['r1i1p1f1','r2i1p1f1','r3i1p1f1','r4i1p1f1','r5i1p1f1',
'r6i1p1f1','r7i1p1f1','r8i1p1f1','r9i1p1f1','r10i1p1f1','r11i1p1f1']
# Alternative variable set (currently disabled):
#varList=['pr','rlut','rsdt','rsut']
# Variables to download; each entry is used as the catalog `variable_id`
varList=['siconca']
# Driver: for every (model, variable, member) combination present in the
# catalog, open the dataset, regrid it and write the result to a NetCDF file.
for model in modelList:
    for var in varList:
        for member in membList:
            # Fresh query per combination, so no state leaks between iterations
            dict_query = {
                'source_id': model,
                'table_id': table,
                'experiment_id': experiment,
                'variable_id': var,
                'member_id': member,
            }
            # Skip combinations that are absent from the catalog
            if not queryData(dict_query):
                continue
            print(var)
            data = getData(dict_query)
            # NOTE(review): `regrid` is neither defined nor imported in the
            # visible part of this file; confirm where it is meant to come
            # from, otherwise this line raises NameError.
            reData = regrid(data)
            # The variable name is part of the file name so that several
            # entries in varList do not overwrite each other's output.
            out_path = '/Data/{0}_{1}_{2}_{3}.nc'.format(
                model, member, experiment, var)
            reData.to_netcdf(path=out_path, mode='w', format='NETCDF4')