Entity transfer: BDA -> cloud — generate reconciliation queries

PHOTO EMBED

Thu Aug 26 2021 17:13:14 GMT+0000 (Coordinated Universal Time)

Saved by @Safrugans ##pyspark

# OPBEL list (Cmd 109 has more results)
# Document numbers (OPBEL) to reconcile. Kept as strings — presumably because
# OPBEL values may carry leading zeros in the source tables (TODO confirm).
opbels = ['315202770489', '598661400957', '353202709339', '541100470776', '598464587854', '346202685945', '324202802135', '598480916806', '319202885264', '312202947012', '352202678789', '562202490034', '561202413691', '560202556164', '361202640606', '598415420300', '598595733012', '397202687363', '546100419912', '349202590129', '507100471308', '598513872630', '584202511872', '359202650339', '507100461832', '378202554599', '344202620289', '308203017960', '388202532380', '359202676702', '120032627089', '598349885794', '598677760481', '368202692148', '562202522678', '344202556277', '143808896301', '531100481780', '318202931027', '537100450113', '507100461831', '599022029773', '599021918726', '321202819928', '143920059595', '572202510340', '341202699190', '387202583320', '367202788683', '145025288916', '598972753006', '398202788439', '300203182269', '549883569266', '387202708985']
####

# Names of the two tables being compared: the BDA-side snapshot and its
# cloud counterpart. Interpolated into Spark SQL below.
df_bda = 'idc_ago_8_205_bda'
df_cloud = 'idc_ago_8_205'

# ensure same # cols on idc_ago_11_207_bda and idc_ago_11_207
# NOTE(review): the comment above mentions idc_ago_11_207* but the tables
# assigned above are idc_ago_8_205* — likely stale from an earlier run; verify.
# Explicit column list projected on BOTH sides of the MINUS so the set
# difference compares identical schemas (DFKKOP-like column names).
cols = 'table_name, mandt, opbel, opupw, opupk, opupz, bukrs, gsber, bupla, segment, prctr, augst, gpart, vtref, vtpos, vtre2, vkont, abwbl, abwtp, abwkt, applk, hvorg, tvorg, kofiz, spart, hkont, mwskz, mwszkz, xanza, stakz, bldat, budat, optxt, waers, faedn, faeds, verkz, studt, sktpz, xmanl, kursf, betrh, betrw, betr2, betr3, skfbt, sbeth, sbetw, sbet2, sbet3, mwsko, mwvko, txrul, spzah, pymet, pybuk, pernr, grkey, persl, xaesp, augdt, augbl, augbd, augrd, augwa, augbt, augbs, augsk, augvd, augob, whang, whgrp, xeiph, mahnv, mansp, xaugp, abrzu, abrzo, fdgrp, fdlev, fdztg, fdwbt, xtaus, augrs, pygrp, pdtyp, sperz, infoz, txjcd, txdat, vbund, kontt, kontl, opsta, blart, emgpa, embvt, emadr, ikey, eurou, xragl, astkz, asblg, xblnr, inkps, rndps, qsskz, qssew, qsptp, qsshb, qbshb, qsznr, xwhex, rfupk, strkz, fitpr, xpyor, landl, intbu, emcrd, c4eye, c4eyp, sctax, sttax, stzal, orupz, negbu, subap, pswsl, pswbt, pswtx, psgrp, xcolc, aasta, xcsha, reacc, rebuk, emmnd, pnnum, pnhkf, pnexd, pnctr, finre, rdsta, rdstb, deaktiv, sgrkey, solldat, recpt, tocollect, einmalanf, vorauszahl, aperiodic, abrabs, grbbp, asmeth, int_crossrefno, ethppm, payfreqid, invoicing_party, ppmst, logno, aperiodict, add_refobj, add_refobjid, add_service, xustpd, ptitm, hora_val_comp, id_processo'


for opbel in opbels:

  # Key columns identifying an item within a document.
  keys = 'opbel, opupw, opupk, opupz'    ## replace "*" for desired cols - item0, item1,...

  # Rows present in the BDA table but absent from the cloud table, restricted
  # to this OPBEL.
  # FIX: opbel is a string id — interpolate it quoted. Unquoted it is parsed
  # as a numeric literal, which breaks matching for ids with leading zeros.
  df = spark.sql(
    f"select {keys} from( select  {cols} from {df_bda} minus select {cols} from {df_cloud}) as B"
    f" where  opbel = '{opbel}'"
  )
  df.show()

  values = df.distinct().collect()

  filter_cols = keys.split(",")

  # One quoted, comma-separated value list per key column.
  # FIX: skip NULLs (None) — the original str concatenation raised TypeError
  # on them. join() also avoids the quadratic += / trailing-comma trimming.
  item = [
    ", ".join(f"'{row[i]}'" for row in values if row[i] is not None)
    for i in range(len(filter_cols))
  ]

  # build bda and cloud query
    ## fill in bda and cloud sources
  bda_sources = ['src_p16.dfkkop', ]
  cloud_sources = ['dfkkop_consolidated_08ago', ]

  # AND-filter fragment for the mismatching key values.
  # FIX: omit columns with no values — "col in ()" is invalid SQL.
  filters = ''.join(
    f'\n\tand {col} in ({vals})'
    for col, vals in zip(filter_cols, item)
    if vals
  )

  for bda_source in bda_sources:
    query = f"select * from  {bda_source}\n\twhere opbel = '{opbel}' {filters}"
    print("opbel: ", opbel, "\nquery bda:\n", query, "\n")

  for cloud_source in cloud_sources:
    query = f"select * from  {cloud_source}\n\twhere opbel = '{opbel}' {filters}"
    print("query cloud:\n", query, "\n")

    # FIX: the original ran "select count(*) from <full SELECT statement>",
    # which is invalid SQL — a derived table must be parenthesized and aliased.
    print(spark.sql(f"select count(*) from ({query}) as q").collect()[0], "\n\n")
# content_copyCOPY  (snippet-site copy-button artifact; commented out so the file parses)