Extract Tables from PDFs or website

PHOTO

Sun May 08 2022 13:36:33 GMT+0000 (Coordinated Universal Time)

# pip (shell command: run it in a terminal, not inside Python)
pip install camelot-py
# conda (shell command: alternative to the pip install above)
conda install -c conda-forge camelot-py
import camelot
# Read page 1 of foo.pdf using the 'lattice' flavor.
tables = camelot.read_pdf('foo.pdf', pages='1', flavor='lattice')
print(tables)
# Export the extracted tables in CSV format.
# NOTE(review): compress=True presumably bundles the output into an archive; confirm against the Camelot docs.
tables.export('foo.csv', f='csv', compress=True)
tables[0].to_csv('foo.csv')  # write the first table to a csv file
print(tables[0].df)  # the first table as a df


# from website
import pandas as pd
# read_html's result is indexed below, i.e. it holds one entry per table found at the URL.
simpsons = pd.read_html('https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes_(seasons_1%E2%80%9320)')
# getting the first 5 rows of the table "Season 1" (second table)
simpsons[1].head()

COPY

https://medium.com/geekculture/automate-4-boring-tasks-in-python-with-5-lines-of-code-55901b3cd5dc

Save snippets that work from anywhere online with our extensions

Comments

python code

@QuinnFox12

.partition() (picking up piece of string between separators) findall and search text between 2 strings Combine columns in dataframe create and save dataframe to csv replace function in dataframe apply function and def Read filenames in folder Python regex related snippets create and save dataframe to csv mount Drive and colab Print function working with time series snippets Train Fasttext - GloVe algorithm on my own corpus Extract Tables from PDFs or website Remove blank cell in pandas dataframe time in pandas dataframe python vlookup dataframe dataframe all related

#python #directory #folder #file #creator

Folder creator

import pathlib
from pathlib import Path

def create_folder(path):
    """Create the folder at *path* (including missing parents) and report it.

    Prints "Folder already exists!" when *path* is already a directory;
    otherwise creates it and prints "Folder created!".

    Args:
        path: Folder location as a string or path-like object.

    Raises:
        FileExistsError: If *path* exists but is not a directory.
    """
    folder = Path(path)
    try:
        # Let mkdir itself report whether the folder was new; this avoids
        # the check-then-create race of testing is_dir() beforehand.
        folder.mkdir(parents=True, exist_ok=False)
    except FileExistsError:
        # Something is already there: only a directory counts as "exists".
        if not folder.is_dir():
            raise
        print("Folder already exists!")
    else:
        print("Folder created!")


# Folder to create; create_folder() also creates any missing parent folders.
FOLDER_PATH = '/content/drive/MyDrive/detect-covid19-xray/data-preprocessed'
create_folder(FOLDER_PATH)

#python #directory #folder #file #creator

Image data augmentation by using ImageDataGenerator

# Augmentation settings applied to the training images.
train_datagen = image.ImageDataGenerator(
    rescale=1./255,  # normalize pixel values from the 0-255 range to 0-1
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
)

# Only rescaling is done on the test/validation dataset.
test_datagen = image.ImageDataGenerator(
    rescale=1./255,
)

# Batches of training images read from TRAIN_PATH; every generated image is
# also written to SAVE_TRAIN_PATH as a PNG with an empty filename prefix.
train_generator = train_datagen.flow_from_directory(
    directory=TRAIN_PATH,
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    save_to_dir=SAVE_TRAIN_PATH,
    save_prefix='',
    save_format='png',
)

validation_generator = test_datagen.flow_from_directory(
    VAL_PATH,
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
)

# Inspect the class mappings of both generators. These bare expressions only
# display a value in a notebook/REPL; in a plain script they are no-ops.
train_generator.class_indices
validation_generator.class_indices

# Generate augmented images and save them into the directory: pulling a batch
# is what triggers the save. The built-in next() uses the standard iterator
# protocol instead of the library-specific .next() method, which newer Keras
# releases no longer expose.
for _ in range(5):
    next(train_generator)

#python #directory #folder #file #creator

Directory creator

def create_dir(path, sub_dirs, label_dirs):
    """Create ``path/<sub_dir>/<label_dir>`` for every sub_dir/label_dir pair.

    Each created (or already existing) directory path is printed, followed
    by a final confirmation message.

    Args:
        path: Root folder under which the tree is created.
        sub_dirs: Iterable of first-level folder names (e.g. 'train/').
        label_dirs: Iterable of second-level folder names (e.g. 'dogs/').
    """
    # Imported locally so the function works even though the surrounding
    # snippet never imports os at module level.
    import os

    for sub_dir in sub_dirs:
        for label_dir in label_dirs:
            new_dir = os.path.join(path, sub_dir, label_dir)
            # Creates missing parents and tolerates existing directories.
            os.makedirs(new_dir, exist_ok=True)
            print(new_dir)

    print('All directories created successfully!')

# Root folder, first-level split folders and second-level label folders
# passed to create_dir(), which creates one directory per split/label pair.
FOLDER_PATH = 'dataset/'
SUB_DIRS = ['train/', 'test/']
LABEL_DIR = ['dogs/', 'cats/']

create_dir(FOLDER_PATH, SUB_DIRS, LABEL_DIR)

#folder

Create multiple folders

from pathlib import Path
import calendar

# Month names without the empty placeholder that sits at index 0.
month_names = list(calendar.month_name)[1:]
days = ['Day 1', 'Day 8', 'Day 15', 'Day 22', 'Day 28']

# One folder per month ("<number>.<name>") under 2022/, each holding one
# sub-folder per entry in `days`.
for month_number, month in enumerate(month_names, start=1):
    for day in days:
        day_dir = Path(f'2022/{month_number}.{month}/{day}')
        day_dir.mkdir(parents=True, exist_ok=True)

#folder

Extract Tables from PDFs or website

# pip (shell command: run it in a terminal, not inside Python)
pip install camelot-py
# conda (shell command: alternative to the pip install above)
conda install -c conda-forge camelot-py
import camelot
# Read page 1 of foo.pdf using the 'lattice' flavor.
tables = camelot.read_pdf('foo.pdf', pages='1', flavor='lattice')
print(tables)
# Export the extracted tables in CSV format.
# NOTE(review): compress=True presumably bundles the output into an archive; confirm against the Camelot docs.
tables.export('foo.csv', f='csv', compress=True)
tables[0].to_csv('foo.csv')  # write the first table to a csv file
print(tables[0].df)  # the first table as a df


# from website
import pandas as pd
# read_html's result is indexed below, i.e. it holds one entry per table found at the URL.
simpsons = pd.read_html('https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes_(seasons_1%E2%80%9320)')
# getting the first 5 rows of the table "Season 1" (second table)
simpsons[1].head()

#folder

Multiple JSON files to CSV

# JSON files to merge into a single CSV.
json_list = [
    'a-1.json',
    'a-2.json',
    'a-3.json',
]

# Collect the frames first and concatenate once at the end: this avoids
# re-copying the accumulated data on every iteration and avoids concatenating
# onto an empty placeholder frame.
frames = []
for json_file in json_list:
    try:
        frames.append(pd.read_json(json_file))
    except (ValueError, OSError) as err:
        # Best-effort merge: an unreadable or malformed file is skipped, but
        # reported instead of being silently ignored.
        print(f'Skipping {json_file}: {err}')

# pd.concat raises on an empty list, so fall back to an empty frame.
final_df = pd.concat(frames) if frames else pd.DataFrame()

final_df.to_csv('a1_a171_profiles.csv')

#python #removefiles #folder

Deleting all the files from the folder

# Walk the tree rooted at ./Test/ and delete every file found;
# the directories themselves are left in place.
for folder, _subfolders, files in os.walk("./Test/"):
    for name in files:
        target = os.path.join(folder, name)
        os.remove(target)

Extract Tables from PDFs or website

Save snippets that work from anywhere online with our extensions

Comments

More like this

python code

Browse more snippets >>

Extract Tables from PDFs or website

Save snippets that work from anywhere online with our extensions

Comments

More like this

python code

Browse more snippets >>

Embed code snippet