Read Files

PHOTO EMBED

Fri Aug 25 2023 01:04:33 GMT+0000 (Coordinated Universal Time)

Saved by @aadpaguio

def get_files(source_path, file_frmt, end_mth, n):
    '''
    source_path = source path
    file_ftm = datetime format of filename convention
    end_mth = year and month of run date in datetime 
    n = number of months

    Returns: list of files, list of mths of files
    '''
    # loop through the last n months of files available
    for mth in reversed(pd.date_range(end_mth-relativedelta(months=n), end_mth, freq='MS')):
        # check if filename exists 
        if os.path.exists(os.path.join(source_path, mth.strftime(file_frmt))):
            upper_lim = mth
            break
            
    # get the correct 12 month range of available files 
    mths = pd.date_range(upper_lim-relativedelta(months=n), upper_lim, freq='MS')
    files =  [mth.strftime(file_frmt) for mth in mths]
    return files, mths

#CHANGE FILE FORMATS
# Declare Run Month (upper limit of date scope)
run_date = '2023-06'
run_date_dt = pd.to_datetime(run_date)
run_date_dt

# Get ideal range of months
n_true = 12 # PLACE HERE NUMBER OF MONTHS
n = n_true - 1 # used for relativedelta
ideal_mths = pd.date_range(run_date_dt-relativedelta(months=n), run_date_dt, freq='MS')

usg_locpath = r"C:\Users\aadpaguio\bpi.com.ph\Business Transformation and Data Analytics - DS-BAU (Data Requests) - Documents\REGULAR DATA\ULC\usage"
usg_frmt = 'usage_%Y%m.parquet'

usg_files, usg_months = get_files(source_path = usg_locpath,
                                  file_frmt = usg_frmt,
                                  end_mth = run_date_dt,
                                  n = n
                                 )

usgg = []
csts = base_final.CST_ID.unique().tolist()
for file in tqdm(usg_files):
    print(file)
    usg = pd.read_parquet(os.path.join(usg_locpath, file))
    
    usg = (usg.reset_index().query('CST_ID in @csts')
            .drop_duplicates().fillna(0))
    
    usg['PERIOD'] = pd.to_datetime(file[-14:-8], format = '%Y%m').to_period(freq = 'M')
    
    

    garbc()
    usgg.append(usg)
    del usg
    garbc()
content_copyCOPY