Entity transfer: BDA -> cloud — generate reconciliation queries

PHOTO EMBED

Thu Aug 26 2021 17:13:14 GMT+0000 (Coordinated Universal Time)

Saved by @Safrugans ##pyspark

# OPBEL list (Cmd 109 has more results)
# Document numbers (OPBEL) to reconcile. Kept as strings — presumably because
# OPBEL values may carry leading zeros in the source tables (TODO confirm).
opbels = ['315202770489', '598661400957', '353202709339', '541100470776', '598464587854', '346202685945', '324202802135', '598480916806', '319202885264', '312202947012', '352202678789', '562202490034', '561202413691', '560202556164', '361202640606', '598415420300', '598595733012', '397202687363', '546100419912', '349202590129', '507100471308', '598513872630', '584202511872', '359202650339', '507100461832', '378202554599', '344202620289', '308203017960', '388202532380', '359202676702', '120032627089', '598349885794', '598677760481', '368202692148', '562202522678', '344202556277', '143808896301', '531100481780', '318202931027', '537100450113', '507100461831', '599022029773', '599021918726', '321202819928', '143920059595', '572202510340', '341202699190', '387202583320', '367202788683', '145025288916', '598972753006', '398202788439', '300203182269', '549883569266', '387202708985']
####

# Names of the two tables being compared: the BDA-side snapshot and its
# cloud counterpart. Interpolated into Spark SQL below.
df_bda = 'idc_ago_8_205_bda'
df_cloud = 'idc_ago_8_205'

# ensure same # cols on idc_ago_11_207_bda and idc_ago_11_207
# NOTE(review): the comment above mentions idc_ago_11_207* but the tables
# assigned above are idc_ago_8_205* — likely stale from an earlier run; verify.
# Explicit column list projected on BOTH sides of the MINUS so the set
# difference compares identical schemas (DFKKOP-like column names).
cols = 'table_name, mandt, opbel, opupw, opupk, opupz, bukrs, gsber, bupla, segment, prctr, augst, gpart, vtref, vtpos, vtre2, vkont, abwbl, abwtp, abwkt, applk, hvorg, tvorg, kofiz, spart, hkont, mwskz, mwszkz, xanza, stakz, bldat, budat, optxt, waers, faedn, faeds, verkz, studt, sktpz, xmanl, kursf, betrh, betrw, betr2, betr3, skfbt, sbeth, sbetw, sbet2, sbet3, mwsko, mwvko, txrul, spzah, pymet, pybuk, pernr, grkey, persl, xaesp, augdt, augbl, augbd, augrd, augwa, augbt, augbs, augsk, augvd, augob, whang, whgrp, xeiph, mahnv, mansp, xaugp, abrzu, abrzo, fdgrp, fdlev, fdztg, fdwbt, xtaus, augrs, pygrp, pdtyp, sperz, infoz, txjcd, txdat, vbund, kontt, kontl, opsta, blart, emgpa, embvt, emadr, ikey, eurou, xragl, astkz, asblg, xblnr, inkps, rndps, qsskz, qssew, qsptp, qsshb, qbshb, qsznr, xwhex, rfupk, strkz, fitpr, xpyor, landl, intbu, emcrd, c4eye, c4eyp, sctax, sttax, stzal, orupz, negbu, subap, pswsl, pswbt, pswtx, psgrp, xcolc, aasta, xcsha, reacc, rebuk, emmnd, pnnum, pnhkf, pnexd, pnctr, finre, rdsta, rdstb, deaktiv, sgrkey, solldat, recpt, tocollect, einmalanf, vorauszahl, aperiodic, abrabs, grbbp, asmeth, int_crossrefno, ethppm, payfreqid, invoicing_party, ppmst, logno, aperiodict, add_refobj, add_refobjid, add_service, xustpd, ptitm, hora_val_comp, id_processo'


for opbel in opbels:

  # Key columns identifying an item within a document.
  keys = 'opbel, opupw, opupk, opupz'    ## replace "*" for desired cols - item0, item1,...

  # Rows present in the BDA table but absent from the cloud table, restricted
  # to this OPBEL.
  # FIX: opbel is a string id — interpolate it quoted. Unquoted it is parsed
  # as a numeric literal, which breaks matching for ids with leading zeros.
  df = spark.sql(
    f"select {keys} from( select  {cols} from {df_bda} minus select {cols} from {df_cloud}) as B"
    f" where  opbel = '{opbel}'"
  )
  df.show()

  values = df.distinct().collect()

  filter_cols = keys.split(",")

  # One quoted, comma-separated value list per key column.
  # FIX: skip NULLs (None) — the original str concatenation raised TypeError
  # on them. join() also avoids the quadratic += / trailing-comma trimming.
  item = [
    ", ".join(f"'{row[i]}'" for row in values if row[i] is not None)
    for i in range(len(filter_cols))
  ]

  # build bda and cloud query
    ## fill in bda and cloud sources
  bda_sources = ['src_p16.dfkkop', ]
  cloud_sources = ['dfkkop_consolidated_08ago', ]

  # AND-filter fragment for the mismatching key values.
  # FIX: omit columns with no values — "col in ()" is invalid SQL.
  filters = ''.join(
    f'\n\tand {col} in ({vals})'
    for col, vals in zip(filter_cols, item)
    if vals
  )

  for bda_source in bda_sources:
    query = f"select * from  {bda_source}\n\twhere opbel = '{opbel}' {filters}"
    print("opbel: ", opbel, "\nquery bda:\n", query, "\n")

  for cloud_source in cloud_sources:
    query = f"select * from  {cloud_source}\n\twhere opbel = '{opbel}' {filters}"
    print("query cloud:\n", query, "\n")

    # FIX: the original ran "select count(*) from <full SELECT statement>",
    # which is invalid SQL — a derived table must be parenthesized and aliased.
    print(spark.sql(f"select count(*) from ({query}) as q").collect()[0], "\n\n")
# content_copyCOPY  (snippet-site copy-button artifact; commented out so the file parses)