def get_files(source_path, file_frmt, end_mth, n):
'''
source_path = source path
file_ftm = datetime format of filename convention
end_mth = year and month of run date in datetime
n = number of months
Returns: list of files, list of mths of files
'''
# loop through the last n months of files available
for mth in reversed(pd.date_range(end_mth-relativedelta(months=n), end_mth, freq='MS')):
# check if filename exists
if os.path.exists(os.path.join(source_path, mth.strftime(file_frmt))):
upper_lim = mth
break
# get the correct 12 month range of available files
mths = pd.date_range(upper_lim-relativedelta(months=n), upper_lim, freq='MS')
files = [mth.strftime(file_frmt) for mth in mths]
return files, mths
#CHANGE FILE FORMATS
# Declare Run Month (upper limit of date scope)
run_date = '2023-06'
run_date_dt = pd.to_datetime(run_date)
run_date_dt
# Get ideal range of months
n_true = 12 # PLACE HERE NUMBER OF MONTHS
n = n_true - 1 # used for relativedelta
ideal_mths = pd.date_range(run_date_dt-relativedelta(months=n), run_date_dt, freq='MS')
usg_locpath = r"C:\Users\aadpaguio\bpi.com.ph\Business Transformation and Data Analytics - DS-BAU (Data Requests) - Documents\REGULAR DATA\ULC\usage"
usg_frmt = 'usage_%Y%m.parquet'
usg_files, usg_months = get_files(source_path = usg_locpath,
file_frmt = usg_frmt,
end_mth = run_date_dt,
n = n
)
usgg = []
csts = base_final.CST_ID.unique().tolist()
for file in tqdm(usg_files):
print(file)
usg = pd.read_parquet(os.path.join(usg_locpath, file))
usg = (usg.reset_index().query('CST_ID in @csts')
.drop_duplicates().fillna(0))
usg['PERIOD'] = pd.to_datetime(file[-14:-8], format = '%Y%m').to_period(freq = 'M')
garbc()
usgg.append(usg)
del usg
garbc()