Read Files
Fri Aug 25 2023 01:04:33 GMT+0000 (Coordinated Universal Time)
Saved by @aadpaguio
def get_files(source_path, file_frmt, end_mth, n): ''' source_path = source path file_ftm = datetime format of filename convention end_mth = year and month of run date in datetime n = number of months Returns: list of files, list of mths of files ''' # loop through the last n months of files available for mth in reversed(pd.date_range(end_mth-relativedelta(months=n), end_mth, freq='MS')): # check if filename exists if os.path.exists(os.path.join(source_path, mth.strftime(file_frmt))): upper_lim = mth break # get the correct 12 month range of available files mths = pd.date_range(upper_lim-relativedelta(months=n), upper_lim, freq='MS') files = [mth.strftime(file_frmt) for mth in mths] return files, mths #CHANGE FILE FORMATS # Declare Run Month (upper limit of date scope) run_date = '2023-06' run_date_dt = pd.to_datetime(run_date) run_date_dt # Get ideal range of months n_true = 12 # PLACE HERE NUMBER OF MONTHS n = n_true - 1 # used for relativedelta ideal_mths = pd.date_range(run_date_dt-relativedelta(months=n), run_date_dt, freq='MS') usg_locpath = r"C:\Users\aadpaguio\bpi.com.ph\Business Transformation and Data Analytics - DS-BAU (Data Requests) - Documents\REGULAR DATA\ULC\usage" usg_frmt = 'usage_%Y%m.parquet' usg_files, usg_months = get_files(source_path = usg_locpath, file_frmt = usg_frmt, end_mth = run_date_dt, n = n ) usgg = [] csts = base_final.CST_ID.unique().tolist() for file in tqdm(usg_files): print(file) usg = pd.read_parquet(os.path.join(usg_locpath, file)) usg = (usg.reset_index().query('CST_ID in @csts') .drop_duplicates().fillna(0)) usg['PERIOD'] = pd.to_datetime(file[-14:-8], format = '%Y%m').to_period(freq = 'M') garbc() usgg.append(usg) del usg garbc()
Comments