Download CMIP6 monthly data from AWS S3
Tue Nov 14 2023 18:07:30 GMT+0000 (Coordinated Universal Time)
Saved by @diptish
# Imports are kept in their original relative order (duplicates collapsed).
import xarray as xr
from netCDF4 import Dataset
import h5py
import numpy as np
from multiprocessing import Pool
import s3fs
import time
import pandas as pd
import json
import os
import sys
import subprocess as sp

# Connect to AWS S3 storage (anonymous access).
fs = s3fs.S3FileSystem(anon=True)

# Catalogue CSV that is searched for matching datasets; the ``zstore``
# column holds the store location passed to ``fs.get_mapper``.
df = pd.read_csv("https://cmip6-pds.s3.amazonaws.com/pangeo-cmip6.csv")

proc = 40


def _build_query(query):
    '''
    Build the expression string handed to pandas.DataFrame.query.

    query (dict or str) - a str is passed through unchanged
                        - a dict is turned into " & "-joined clauses:
                          scalar values become  param=='value'
                          list/tuple values become  param in ['v1', 'v2']

    Raises TypeError when query is neither a dict nor a str.
    '''
    if isinstance(query, str):
        # Already a query expression, pass through.
        return query
    if isinstance(query, dict):
        clauses = []
        for param, value in query.items():
            if isinstance(value, (list, tuple)):
                # Several accepted values for one column.
                clauses.append("{0} in {1!r}".format(param, list(value)))
            else:
                clauses.append("{0}=='{1}'".format(param, value))
        return " & ".join(clauses)
    raise TypeError(
        "query must be a dict or a str, got {0}".format(type(query).__name__)
    )


def getData(query: dict):
    '''
    Load AWS CMIP6 data into an xarray dataset.

    query (dict or str) - dict or str with data information
                        - if dict format as {'param':'value','param2':['val1','val2']}

    Returns the dataset opened from the last matching catalogue row, or
    None (after printing a message) when no row matches. The store is
    opened with open_zarr, so the data are not loaded yet.
    '''
    inputStr = _build_query(query)

    # Search the CMIP6 catalogue for datasets that match the parameters.
    df_subset = df.query(inputStr)
    if df_subset.empty:
        print('data not available for ' + inputStr)
        return None

    # When several rows match, only the last one was ever returned, so
    # only that store is opened.
    zstore = df_subset.zstore.values[-1]
    mapper = fs.get_mapper(zstore)
    return xr.open_zarr(mapper, consolidated=True)


def queryData(query: dict):
    '''
    Query data availability.

    query (dict or str) - same format as accepted by getData

    Returns True when at least one catalogue row matches, else False.
    '''
    return not df.query(_build_query(query)).empty


modelList = ['CESM2']
# modelList=[ 'ACCESS-ESM1-5', 'AWI-CM-1-1-MR', 'AWI-ESM-1-1-LR', 'BCC-CSM2-MR',
#     'BCC-ESM1', 'CAS-ESM2-0', 'CESM2', 'CESM2-FV2', 'CESM2-WACCM',
#     'CESM2-WACCM-FV2', 'CMCC-CM2-HR4', 'CMCC-CM2-SR5', 'CMCC-ESM2',
#     'CNRM-CM6-1', 'CNRM-CM6-1-HR', 'CNRM-ESM2-1', 'CanESM5-CanOE', 'E3SM-1-0',
#     'E3SM-1-1', 'E3SM-1-1-ECA', 'EC-Earth3', 'EC-Earth3-AerChem',
#     'EC-Earth3-Veg', 'EC-Earth3-Veg-LR', 'FGOALS-f3-L', 'FGOALS-g3',
#     'FIO-ESM-2-0', 'GFDL-CM4', 'GFDL-ESM4','GISS-E2-1-G-CC', 'HadGEM3-GC31-LL',
#     'HadGEM3-GC31-MM', 'IITM-ESM', 'INM-CM4-8', 'INM-CM5-0', 'IPSL-CM6A-LR',
#     'KACE-1-0-G', 'KIOST-ESM', 'MIROC-ES2L', 'MIROC6', 'MPI-ESM-1-2-HAM',
#     'MPI-ESM1-2-HR', 'MPI-ESM1-2-LR', 'MRI-ESM2-0', 'NESM3', 'NorESM2-MM',
#     'SAM0-UNICON', 'TaiESM1', 'UKESM1-0-LL']

experiment = 'historical'
table = 'SImon'  ## 'Amon'
membList = ['r1i1p1f1', 'r2i1p1f1', 'r3i1p1f1', 'r4i1p1f1', 'r5i1p1f1',
            'r6i1p1f1', 'r7i1p1f1', 'r8i1p1f1', 'r9i1p1f1', 'r10i1p1f1',
            'r11i1p1f1']
#varList=['pr','rlut','rsdt','rsut']
varList = ['siconca']

# Write one NetCDF file per available (model, variable, member) combination.
for model in modelList:
    dict_query = {'source_id': model, 'table_id': table,
                  'experiment_id': experiment}
    for var in varList:
        dict_query['variable_id'] = var
        for member in membList:
            dict_query['member_id'] = member
            # Skip combinations that are not in the catalogue.
            if not queryData(dict_query):
                continue
            print(var)
            data = getData(dict_query)
            membOut = member
            # NOTE(review): regrid is neither defined nor imported in the
            # visible source; it must be provided before this script can
            # run - confirm where it is supposed to come from.
            reData = regrid(data)
            # The variable name is taken from the loop so that several
            # variables do not overwrite each other's output file.
            out_path = '/Data/{0}_{1}_{2}_{3}.nc'.format(
                model, membOut, experiment, var)
            reData.to_netcdf(path=out_path, mode='w', format='NETCDF4')
Comments