thiscodeWorks - Organizing the best of code online

##python #coding #python #programming #word #doc #csv #pandas

how to convert csv file to word file in Python

from docx import Document
import pandas as pd

df = pd.read_csv('out.csv').to_dict()
word_doc = Document()

table_head = list(df.keys())

table_head_len = len(table_head)
tables_rows_len = len(df['name'])

# create tabel
table = word_doc.add_table(cols = table_head_len, rows = tables_rows_len)

# add the first row in the table
for i in range(table_head_len):
    table.cell(row_idx = 0, col_idx = i).text = table_head[i]


# add rows for name col
for i in range(1, tables_rows_len):
    table.cell(row_idx = i, col_idx = 0).text = df['name'][i]


# add rows for age col
for i in range(1, tables_rows_len):
    table.cell(row_idx = i, col_idx = 1).text = str(df['age'][i])


word_doc.save('word_doc.docx')

#python #pandas #businesanalytics

Importar paquetes propios

from sys import path
import os
path.append(os.path.realpath('../'))

#python #fastapi #pandas

fastapi: dataframe to download

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import io
import pandas as pd

app = FastAPI()

@app.get("/get_csv")
async def get_csv():
    df = pd.DataFrame(dict(col1 = 1, col2 = 2), index=[0])
    stream = io.StringIO()
    df.to_csv(stream, index = False)
    response = StreamingResponse(iter([stream.getvalue()]),
                                 media_type="text/csv"
                                )
    response.headers["Content-Disposition"] = "attachment; filename=export.csv"
    return response

#python #pandas

Replace a value in a Dataframe with NaN

# Assume -9999 is a missing data flag
df.replace(-9999, np.NaN)
# Alternatively, perform this when creating the dataframe:
pd.read_csv("https://www.atmos.albany.edu/products/metarCSV/world_metar_latest.csv", sep='\s+',na_values=['9999.00','-9999.0'])

#pandas #profile

Pandas Profile

pip install pandas-profiling

from pandas_profiling import ProfileReport
profile = ProfileReport(master_data, title="OMI v3.5 Profile Report")
profile.to_file("OMIv3.5_profilereport.html")

#pandas #python #unique #dataframe

Unique Values in Pandas

df['column'].unique()  # Unique Values
df['column'].nunique() # Number of unique values
df['column'].value_counts() # Both

#python #pandas

Filtering for selected items

#Post prepare_data_for_forecast
input_selected_items_df = pd.read_excel('item_list_2023-05-02.xlsx')
target_df = target_df[target_df['ITEM_ID'].isin(input_selected_items_df['ITEM_ID'])]

#python #pandas

Pandas options during import

import pandas as pd

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None) 
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

#pandas #dataframe

Concate dataframe in for loop

df = pd.DataFrame()
for urls in urls_list:
	df = pd.concat([df,df_new])

#python #pandas

Lambda function for parsing datetime objects in a dataframe

def parsed_df(**kwargs):
    # First define the format and then define the function
    timeFormat = "%Y-%m-%d %H:%M:%S UTC"
    # This function will iterate over each string in a 1-d array
    # and use Pandas' implementation of strptime to convert the string into a datetime object.
    parseTime = lambda x: datetime.strptime(x, timeFormat)
    return pd.read_csv('/path/to/file.csv',parse_dates=['time'], date_parser=parseTime).set_index('time')

#python #pandas #binning

Binning and bucketizing

port matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# creating a dictionary
sns.set_style("whitegrid")
plt.rc('font', size=16) #controls default text size
plt.rc('axes', titlesize=16) #fontsize of the title
plt.rc('axes', labelsize=16) #fontsize of the x and y labels
plt.rc('xtick', labelsize=16) #fontsize of the x tick labels
plt.rc('ytick', labelsize=16) #fontsize of the y tick labels
plt.rc('legend', fontsize=16) #fontsize of the legend

# load dataset - census income
census_income = pd.read_csv(r'../input/income/train.csv')

# define figure
fig, (ax1, ax2) = plt.subplots(2)
fig.set_size_inches(18.5, 10.5)

# plot age histogram
age_count = census_income.groupby(by=["age"])["age"].count()
ax1.bar(age_count.index, age_count, color='black')
ax1.set_ylabel("Counts")
ax1.set_xlabel("Age")

# binning age
def age_bins(age):
    if age < 29:
        return "1 - young"
    if age < 60 and age >= 29:
        return "2 - middle-aged"
    else:
        return "3 - old-aged"

# apply trans. function
census_income["age_bins"] = census_income["age"].apply(age_bins)

# group and count all entries in the same bin
age_bins_df = census_income.groupby(by=["age_bins"])["age_bins"].count()

ax2.bar(age_bins_df.index, age_bins_df, color='grey')
ax2.set_ylabel("Counts")
ax2.set_xlabel("Age")

#python #pandas #cost #benefit #sensitivity

Exploratory Data Analysis with Dataprep

import pandas as pd
from dataprep.eda import create_report
df = pd.read_csv("parking_violations.csv")
create_report(df)

#Plot 
plot(df)
plot_correlation(df)

#plot correlation
plot_correlation(df, col1, col2)



#load dataset houses, titanic
from dataprep.datasets import load_dataset

#plot diff training and test data
from dataprep.datasets import load_dataset
import numpy as np
df1 = load_dataset("house_prices_train")
df1 = df1.replace(" ?", np.NaN)
df2 = load_dataset("house_prices_test")
df2 = df2.replace(" ?", np.NaN)

#save Report
from dataprep.eda import create_report
report = create_report(df, title='My Report')
report.save(filename='report_01', to='~/Desktop')

#pandas

Replace Male:0 Female:1 in DataFrame (sex)

# creating dummies for gender
data['Gender'] = data['Gender'].map({'Male':0, 'Female':1})

#pandas #groupby

Linear Regression - Univariate regression - Statsmodels

import statsmodels.formula.api as smf

model1 = smf.ols('review_score ~ wait_time', data=orders).fit()
print(model1.summary())

#pandas #groupby

Cheat Sheet for map, apply, applymap and groupby PANDAS

## MAP (for Series)
series.map(function) 
Series.map({mapping dict})

#add a column to your df operating on each row
# Let's call this "custom_sum" as "sum" is a built-in function
def custom_sum(row):
    return row.sum()
df['D'] = df.apply(custom_sum, axis=1)

#add as row
df.loc['Row 5'] = df.apply(custom_sum, axis=0)

## APPLY (for DataFrame)
df.apply(lambda col: col.max(), axis = 0)     # default axis
df.apply(lambda row: row[‘A’] + row[‘B’], axis = 1)
df.applymap(my_funct_for_indiv_elements)
    df.applymap(lambda x: '%.2f' % x)

## GROUPBY
group = df.groupby('col_A')
group.mean()
group.apply(np.mean)
group.agg({
    col_A: ['mean', np.sum],
    col_B: my_custom_sum,
    col_B: lambda s: my_custom_sum(s)
    })

group.apply(custom_mean_function)

#pandas #groupby

dataframe group by

df.groupby('name')['n'].sum().plot(kind='bar', x='name')

#pandas #datetime

Pandas - days since date (time since event)

df['TIMESINCE'] = (pd.Timestamp.today() - df['REGDATE']).dt.days
df['YEARSSINCE'] = df['timesince'] / 365
df

#python #pandas

Convert the column type from string to datetime format in Pandas dataframe - GeeksforGeeks

# convert the 'Date' column to datetime format
df['Date']= pd.to_datetime(df['Date'])
 
# Check the format of 'Date' column
df.info()

#python #pandas

Filter Pandas dataframe by column name on regex patterns using str.contains

df[df.columns[df.columns.str.contains(pat = 'WORDABC9')]]
   WORDABC9N123  WORDABC99N123
0            13             16
1            14             17
2            15             18

df[df.columns[df.columns.str.contains(pat = 'WORDABC9\\b')]]
Empty DataFrame
Columns: []
Index: [0, 1, 2]

#python #pandas #excel #combineexcelsheets

Combine multiple excel sheets of the same file in one dataframe and save it as .csv

import pandas pd

excel_file = pd.read_excel(‘file.xlsx’, sheet_name=None)
dataset_combined = pd.concat(excel_file.values())

#python #pandas #dataset #eda #extractyear #datetime

Extract the year from a date data In pandas

df['year'] = df['date'].dt.year if we put dt.month is for month etc..
df.head()

#python #pandas #dataset #eda #outliers #boundaries

Function to find boundaries for outliers

def find_boundaries(df, variable, distance=1.5):

    IQR = df[variable].quantile(0.75) - df[variable].quantile(0.25)

    lower_boundary = df[variable].quantile(0.25) - (IQR * distance)
    upper_boundary = df[variable].quantile(0.75) + (IQR * distance)

    return upper_boundary, lower_boundary

#python #pandas #dataset #eda #cardibality

Quantifying cardinality In datasets

data.nunique().plot.bar(figsize=(12,6))
plt.ylabel('Number of unique categories')
plt.xlabel('Variables')
plt.title('Cardinality')

## Version with 5% threshold

fig = label_freq.sort_values(ascending=False).plot.bar()
fig.axhline(y=0.05, color='red')
fig.set_ylabel('percentage of cars within each category')
fig.set_xlabel('Variable: class')
fig.set_title('Identifying Rare Categories')
plt.show()

#python #pandas #dataset #eda #missingdata

Quantifying Missing data with charts

data.isnull().mean().plot.bar(figsize=(12,6))
plt.ylabel('Percentage of missing values')
plt.xlabel('Variables')
plt.title('Quantifying missing data')

#python #pandas #dataset #numerical #categorical #eda

Printing, listing and counting the numerical or categorical features in a dataset Pandas

# Get the Numerical Data list to infer distribution plots

numerical = [var for var in df.columns if df[var].dtype!='O'] 
print('There are {} numerical variables\n'.format(len(numerical))) 
print('The numerical variables are :', numerical)

# Get the Categorical Data list to infer distribution plots

categorical = [var for var in df.columns if df[var].dtype =='O'] 
print('There are {} Categorical variables\n'.format(len(categorical))) 
print('The Categorical variables are :', categorical)

#python #pandas #label #mapping

Pandas Map function to provide labels to categorical/text labels

#Add the new column which gives a unique number to each of these labels 

df['label_num'] = df['label'].map({
    'Household' : 0, 
    'Books': 1, 
    'Electronics': 2, 
    'Clothing & Accessories': 3
})

#checking the results 
df.head(5)

#python #pandas #dataframe #crossjoin

Cross Join in Pandas

# Setup fake key in both DataFrames to join on it

df1['key'] = 0
df2['key'] = 0

df1.merge(df2, on='key', how='outer')

#python #pandas #datacleaning #machinelearning #pandas_profiling #save_to_html_file

قراءة وحفط الداتاسيت في ملف html

from pandas_profiling import ProfileReport
import pandas as pd

test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

profile = ProfileReport(train, title="train Pandas Profiling Report")
profile.to_file("train_report.html")
profile = ProfileReport(test, title="test Pandas Profiling Report")
profile.to_file("test_report.html")

#pandas #list #group

Train Fasttext - GloVe algorithm on my own corpus

#You can do it using GloVe library:

#Install it: 

!pip install glove_python

from glove import Corpus, Glove

#Creating a corpus object
corpus = Corpus() 

#Training the corpus to generate the co-occurrence matrix which is used in GloVe
corpus.fit(lines, window=10)

glove = Glove(no_components=5, learning_rate=0.05) 
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')
 Save

 
 
 #for Fasttext
 from gensim.models import FastText
from gensim.test.utils import common_texts  # some example sentences
>>>
print(common_texts[0])
['human', 'interface', 'computer']
print(len(common_texts))
9
model = FastText(vector_size=4, window=3, min_count=1)  # instantiate
model.build_vocab(sentences=common_texts)
model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)  # train
model2 = FastText(vector_size=4, window=3, min_count=1, sentences=common_texts, epochs=10)

import numpy as np
>>>
np.allclose(model.wv['computer'], model2.wv['computer'])
True


from gensim.test.utils import datapath
>>>
corpus_file = datapath('lee_background.cor')  # absolute path to corpus
model3 = FastText(vector_size=4, window=3, min_count=1)
model3.build_vocab(corpus_file=corpus_file)  # scan over corpus to build the vocabulary
>>>
total_words = model3.corpus_total_words  # number of words in the corpus
model3.train(corpus_file=corpus_file, total_words=total_words, epochs=5)


from gensim.utils import tokenize
from gensim import utils
>>>
>>>
class MyIter:
    def __iter__(self):
        path = datapath('crime-and-punishment.txt')
        with utils.open(path, 'r', encoding='utf-8') as fin:
            for line in fin:
                yield list(tokenize(line))
>>>
>>>
model4 = FastText(vector_size=4, window=3, min_count=1)
model4.build_vocab(sentences=MyIter())
total_examples = model4.corpus_count
model4.train(sentences=MyIter(), total_examples=total_examples, epochs=5)
from gensim.test.utils import get_tmpfile
>>>
fname = get_tmpfile("fasttext.model")
>>>
model.save(fname)
model = FastText.load(fname)


# https://radimrehurek.com/gensim/models/fasttext.html

#pandas #list #group

group by the first column and get second column as lists in rows:

In [1]: df = pd.DataFrame( {'a':['A','A','B','B','B','C'], 'b':[1,2,5,5,4,6]})
        df

Out[1]: 
   a  b
0  A  1
1  A  2
2  B  5
3  B  5
4  B  4
5  C  6

In [2]: df.groupby('a')['b'].apply(list)
Out[2]: 
a
A       [1, 2]
B    [5, 5, 4]
C          [6]
Name: b, dtype: object

In [3]: df1 = df.groupby('a')['b'].apply(list).reset_index(name='new')
        df1
Out[3]: 
   a        new
0  A     [1, 2]
1  B  [5, 5, 4]
2  C        [6]

#python #pandas

Create a copy of a Pandas Dataframe with selected columns; write out to csv

df2 = df1.filter(['Col 1', 'Col 2'], axis=1)
df2.write_csv('output.csv', index=False) # do not create a separate column index

#html #pandas #python

convert value_counts() to dataframe | pandas | python

audio_meta["duration"].astype(np.int8).value_counts().rename_axis('duration').reset_index(name='counts')

#pandas #gcp

gcp read data from bucket

import pandas as pd
from google.cloud import storage

BUCKET_NAME = 'zhibo-work'

# Create a Cloud Storage client to download the data
storage_client = storage.Client()

# Download the data
data_bucket = storage_client.bucket(BUCKET_NAME)
blob = data_bucket.blob('description/data_gics.csv')
blob.download_to_filename('data_gics.csv')

#python #pandas #dataframe

brondata naar pandas dataframe

def load_table_to_df(fp, **kwargs):

    if fp.endswith('.xls'):
        result_df = pd.read_excel(fp, **kwargs)
        
    elif fp.endswith('.xlsx'):
        result_df = pd.read_excel(fp, engine='openpyxl', **kwargs)
        
    elif fp.endswith('.csv') or fp.endswith('.txt'):
        result_df = pd.read_csv(fp, **kwargs)
        
    elif fp.endswith('.parquet'):
        result_df = pd.read_parquet(fp, **kwargs)
        
    else:
        return "Wrong file extension"

    print('     -->  Succesfully loaded {}'.format(fp))
    return result_df

#python #pandas #dataframe

Adding new column to a dataframe

lins = pd.Series(lines, name='img_path')
train_df = pd.concat([train_df, lins], axis=1)

#html #pandas #python #visualization

Display two Dataframes side by side

import numpy as np
import pandas as pd   
from IPython.display import display_html 

df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=['A','B','C','D',])
df2 = pd.DataFrame(np.arange(16).reshape((4,4)),columns=['A','B','C','D',])

df1_styler = df1.style.set_table_attributes("style='display:inline'").set_caption('Caption table 1')
df2_styler = df2.style.set_table_attributes("style='display:inline'").set_caption('Caption table 2')

display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)

#python #pandas

Append DataFrame in Pandas

import pandas as pd
	
df_1 = pd.DataFrame(
	[['Somu', 68, 84, 78, 96],
	['Kiku', 74, 56, 88, 85],
	['Ajit', 77, 73, 82, 87]],
	columns=['name', 'physics', 'chemistry','algebra','calculus'])

df_2 = pd.DataFrame(
	[['Amol', 72, 67, 91, 83],
	['Lini', 78, 69, 87, 92]],
	columns=['name', 'physics', 'chemistry','science','calculus'])	

frames = [df_1, df_2]

#append dataframes
df = df_1.append(df_2, ignore_index=True, sort=False)

#print dataframe
print("df_1\n------\n",df_1)
print("\ndf_2\n------\n",df_2)
print("\ndf\n--------\n",df)

#python #pandas

function to match strings in inconsistent data

# function to replace rows in the provided column of the provided dataframe
# that match the provided string above the provided ratio with the provided string
def replace_matches_in_column(df, column, string_to_match, min_ratio = 47):
    # get a list of unique strings
    strings = df[column].unique()
    
    # get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(string_to_match, strings, 
                                         limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # only get matches with a ratio > 90
    close_matches = [matches[0] for matches in matches if matches[1] >= min_ratio]

    # get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # replace all rows with close matches with the input matches 
    df.loc[rows_with_matches, column] = string_to_match
    
    # let us know the function's done
    print("All done!")

#python #pandas

read file directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

#python #datetime #pandas

Convert Excel Days Date to real date Pandas

df['SettlementDate'] = pd.TimedeltaIndex(df['SettlementDate'], unit='d') + dt.datetime(1900,1,1)

#pandas

Counting Mulitple Values in a Python pandas Dataframe Column

df['Tags'].str.split(expand=True).stack().value_counts()

#python #pandas

Weighted Means

def weighted_median(df, median_col, weight_col):
    df_sorted = df.sort_values(median_col)
    cumsum = df_sorted[weight_col].cumsum()
    cutoff = df_sorted[weight_col].sum()/2
    return df_sorted[cumsum >= cutoff][median_col].iloc[0]

#python #pandas

pandas sample one record per unique value

df.groupby('cluster', group_keys=False).apply(lambda df: df.sample(1))

#pandas #nlp #colab

colab common useful snippets

#--------------install pytorch geometric
!python -c "import torch; print(torch.version.cuda)"
!python -c "import torch; print(torch.__version__)"
# check above version and edit below accordingly

!pip install torch==1.9.0
!pip uninstall -y torch-scatter
!pip uninstall -y torch-sparse
!pip uninstall -y torch-cluster
!pip uninstall -y torch-geometric
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-geometric

#--------------mount drive-------------------
from google.colab import drive
drive.mount('/content/drive')
### File path
TRAIN_ID_PATH = '/content/drive/MyDrive/folder/pytorch/train.csv'

#pandas #nlp

Working with null nan nul

df.isnull().sum()

#check where is null in a columns
df[df["Business Description"].isnull() == True]

is_NaN = df.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = df[row_has_NaN]

print(rows_with_NaN)

#pandas #nlp

convert dataframe to txt, to list

df['line'].iloc[:2981] = 'train'
df['line'].iloc[2982:] = 'test'

with open('job_post_01.txt', 'a') as f:
    dfAsString = df.to_string(header=False, index=True)
    f.write(dfAsString)

f.close()

data = pd.read_csv('job_post_01.csv')
df = data.sample(frac = 1).reset_index(drop = True)
doc_name_list = df.values.tolist()
doc_train_list = df.iloc[:2981].values.tolist()
doc_test_list = df.iloc[2982:].values.tolist()

#python #pandas #column #nlp #chinese #trasim

remove punc and stopword chinese

# remove punc, segment and stopword
def punc_jieba(text, sep = ' '):
#     stopword = stopwords(["zh"])
    text_punc = re.sub("[\s+\>\<\:\?\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）！，❤。～《》：（）【】「」？”“；：、【】╮╯▽╰╭★→「」]+".encode().decode("utf8"),
                        "",text)
    text_cut = sep.join(jieba.cut(text_punc, cut_all=False)).lower()
#     tokens = word_tokenize(text_cut)
#     clean_text = [word for word in tokens if not word in stopword]
    
    return text_cut
# mothod1
def stop_word(text):
    stopword = stopwords(['zh'])
    remove_stw = [word for word in text if not word in stopword]
    return remove_stw
df['text'] = df['text'].apply(stop_word)
# mothod2
stopword = stopwords(['zh'])
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopword)]))

#python #pandas #column #nlp #chinese #trasim

convert tra to sim chinese

sudo pip install opencc
# if nt work, should clone project first

import pandas as pd
import numpy as np
# -*- coding: utf-8 -*-
import opencc
from opencc import OpenCC

df = pd.read_csv('training.csv').astype(str)

def tra_sim(text):
    cc = OpenCC('tw2s')
    sim = cc.convert(text)
    return sim
df['sim_label'] = df['label'].apply(tra_sim)
df['sim_detail_label'] = df['detail_label'].apply(tra_sim)
df['sim_text'] = df['text'].apply(tra_sim)

#python #pandas #neuralnetwork

read file .sh in window jupyter

%%bash
!sh ./textcleaner.sh

#python #pandas #regex

Python regex related snippets

https://vimsky.com/zh-tw/examples/detail/python-method-regex.sub.html

# Match text between two strings with regular expression
clean_text = re.search(r'before(.*?)after', text).group(1)
import re
s = 'Part 1. Part 2. Part 3 then more text'
re.search(r'Part 1\.(.*?)Part 3', s).group(1)
' Part 2. '
re.search(r'Part 1(.*?)Part 3', s).group(1)
'. Part 2. '

#python #pandas

Read filenames in folder

all_filenames = glob.glob("/home/lynaza/Desktop/Quinn/lda/檢察官起訴書/*.txt")

#return only filename (may contain not only duoi file)
 import os
 arr = os.listdir("/home/lynaza/Desktop/Quinn/lda/檢察官起訴書")
 print(arr)



import cv2
import os
import glob

def load_images_name(path):
    
    list_1 = glob.glob(path+'/*.tif') # depth of 1 folder
    
    list_2 = glob.glob(path+'/*/*.tif') # depth of 2 folder
    
    list_3 = glob.glob(path+'/*/*/*.tif')  # depth of 3 folder
    
    list_4 = glob.glob(path+'/*/*/*/*.tif')  # depth of 4 folder
    
    images_path = list_1 +list_2 +list_3 + list_4

    return images_path

images = load_images_name("/home/lynaza/Desktop/traindata/test")

#textpreprocessing #nlp #function #pandas

apply function and def

rmsval = df.loc[:, 'c1':'c4']
def getrms(row):  
  a = np.sqrt(sum(row**2/4))
  return a
df['rms'] = df.apply(getrms,axis=1)
df.head()

#textpreprocessing #nlp #pandas #function

replace function in dataframe

for c in df_drop.columns:
    df_drop[c] = df_drop[c].str.replace('[^\w\s]+', '')
df_drop = df_drop.astype(str)
df_drop.head()

#python #textpreprocessing #nlp #dataframe #pandas #save #export

create and save dataframe to csv

import pandas as pd

data = {'Product': ['Desktop Computer','Tablet','Printer','Laptop'],
        'Price': [850,200,150,1300]
        }

df = pd.DataFrame(data, columns= ['Product', 'Price'])

df.to_csv(r'Path where you want to store the exported CSV file\File Name.csv')
# df.to_csv('file_name.csv', encoding='utf-8', index=False)
print (df)

data[['column1','column2','column3',...]].to_csv('fileNameWhereYouwantToWrite.csv')
     
df = pd.DataFrame()
for i in range():
	#....
	df.appen(text)

#py #dataframe #pandas #combine #column

Combine columns in dataframe

# best way
data['resume'] = data[['Resume_title', 'City', 'State', 'Description', 'work_experiences', 'Educations', 'Skills', 'Certificates', 'Additional Information']].agg(' '.join, axis=1)


# other way
df["period"] = df["Year"] + df["quarter"]
df['Period'] = df['Year'] + ' ' + df['Quarter']
df["period"] = df["Year"].astype(str) + df["quarter"] #If one (or both) of the columns are not string typed
#Beware of NaNs when doing this!
df['period'] = df[['Year', 'quarter', ...]].agg('-'.join, axis=1) #for multiple string columns
df['period'] = df[['Year', 'quarter']].apply(lambda x: ''.join(x), axis=1)
#method cat() of the .str accessor 
df['Period'] = df.Year.str.cat(df.Quarter)
df['Period'] = df.Year.astype(str).str.cat(df.Quarter.astype(str), sep='q')
df['AllTogether'] = df['Country'].str.cat(df[['State', 'City']], sep=' - ') #add parameter na_rep to replace the NaN values with a string if have nan
columns = ['whatever', 'columns', 'you', 'choose']
df['period'] = df[columns].astype(str).sum(axis=1)

#a function
def str_join(df, sep, *cols):
   ...:     from functools import reduce
   ...:     return reduce(lambda x, y: x.astype(str).str.cat(y.astype(str), sep=sep), 
   ...:                   [df[col] for col in cols])
   ...: 

In [4]: df['cat'] = str_join(df, '-', 'c0', 'c1', 'c2', 'c3')

#py #dataframe #pandas #replace

replace function in dataframe

for c in df_drop.columns:
    df_drop[c] = df_drop[c].str.replace('[^\w\s]+', '')
df_drop = df_drop.astype(str)
df_drop.head()

#py #dataframe #pandas

apply function and def

rmsval = df.loc[:, 'c1':'c4']
def getrms(row):  
  a = np.sqrt(sum(row**2/4))
  return a
df['rms'] = df.apply(getrms,axis=1)
df.head()

#py #dataframe #pandas

create and save dataframe to csv

import pandas as pd

data = {'Product': ['Desktop Computer','Tablet','Printer','Laptop'],
        'Price': [850,200,150,1300]
        }

df = pd.DataFrame(data, columns= ['Product', 'Price'])

df.to_csv(r'Path where you want to store the exported CSV file\File Name.csv')

print (df)

#py #dataframe #pandas #text #exception

Exception and search text between 2 strings

import re

text = 'this is a text'

try:
    found = re.search('is(.+?)text', text).group(1)
except AttributeError:
    # AAA, ZZZ not found in the original string
    found = '0 wtitle' # apply your error handling
found

=> a

#py #dataframe #pandas

simply-create-dataframe

import pandas as pd, re

junk = """Shot - Wounded/Injured, Shot - Dead (murder, accidental, suicide), Suicide - Attempt, Murder/Suicide, Attempted Murder/Suicide (one variable unsuccessful), Institution/Group/Business, Mass Murder (4+ deceased victims excluding the subject/suspect/perpetrator , one location), Mass Shooting (4+ victims injured or killed excluding the subject/suspect"""

rx = re.compile(r'\([^()]+\)|,(\s+)')

data = [x 
        for nugget in rx.split(junk) if nugget
        for x in [nugget.strip()] if x]

df = pd.DataFrame({'incident_characteristics': data})
print(df)

#python #pandas

Cleaning special characters from text

def clean(txt):
    txt = txt.str.replace("(<br/>)", "")
    txt = txt.str.replace('(<a).*(>).*(</a>)', '')
    txt = txt.str.replace('(&amp)', '')
    txt = txt.str.replace('(&gt)', '')
    txt = txt.str.replace('(&lt)', '')
    txt = txt.str.replace('(\xa0)', ' ')  
    return txt
df['xxx column'] = clean(df['xxx column'])

#python #pandas

Heatmap corr

mask=np.triu(np.ones_like(corr,dtype=bool))

f ,ax = plt.subplots(figsize=(11,9))
cmap=sns.diverging_palette(230,20, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
           square=True, linewidth=.5, cbar_kws={'shrink':.5})

#python #pandas

Annotate V2

for p in ax.patches:
    values= '{:.0f}'.format(p.get_height())
    x = p.get_x() + p.get_width()/2
    y = p.get_height()
    ax.annotate(values, (x, y),ha='center', va ='bottom', fontsize = 11)

#python #pandas

Annotate V2

for p in ax.patches:
    values= '{:.0f}'.format(p.get_height())
    x = p.get_x() + p.get_width()/2
    y = p.get_height()
    ax.annotate(values, (x, y),ha='center', va ='bottom', fontsize = 11)

#python #pandas

Annotate

g = sns.*plot 
ax = g 
for p in ax.patches:
    ax.text(p.get_x() + p.get_width()/2., p.get_height(), '{0:.2f}'.format(p.get_height()), 
        fontsize=12, color='black', ha='center', va='bottom')

#undefined #python #pandas

django start

django-admin startproject mysite
 
python manage.py startapp myapp

#undefined #python #pandas

Pandas Profiling

import pandas as pd
import matplotlib.pyplot as plt

from pandas_profiling import ProfileReport
profile = ProfileReport(gabijos, title='Gabijos g.', html={'style':{'full_width':True}})
profile.to_file("gabijos.html")

#python #pandas

pandas update column value

df_query = df_query.assign(comments='NoComment')

#python #pandas

compare differences between two dataframes

qq= dff[~df.astype(str).apply(tuple, 1).isin(dff.astype(str).apply(tuple, 1))]

#python #pandas

value text on bar charts

for p in ax.patches:
    values = '{:.0f}'.format(p.get_height())
    x = p.get_x() + p.get_width()/2
    y = p.get_height()
    ax.annotate(values, (x, y),ha='center', va ='bottom', fontsize = 10)

#python #pandas

# Seperate various categorical words that are in a single column

.apply(lambda x: x.replace(',',',').replace(',',',').split(',')

#matplotlib #pandas #seaborn #ensemble #voting #classifier #xgb #boosting

Importing libraries

# importing libraries 
from sklearn.ensemble import VotingClassifier ,BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score 
from numpy import mean,std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold,train_test_split
from sklearn.linear_model import LogisticRegression,RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot
from sklearn.datasets import load_wine,load_iris
from matplotlib.pyplot import figure
figure(num=2, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')
import xgboost as xgb
from sklearn.feature_selection import SelectKBest,f_regression
from sklearn.linear_model import LinearRegression,BayesianRidge,ElasticNet,Lasso,SGDRegressor,Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,RobustScaler,StandardScaler
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA,KernelPCA
from sklearn.ensemble import ExtraTreesRegressor,GradientBoostingRegressor,RandomForestRegressor,VotingClassifier
from sklearn.model_selection import cross_val_score,KFold,GridSearchCV,RandomizedSearchCV,StratifiedKFold,train_test_split
from sklearn.base import BaseEstimator,clone,TransformerMixin,RegressorMixin
from sklearn.svm import LinearSVR,SVR
#import xgboost 
from xgboost import XGBRegressor
#Import Pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import skew
from scipy.stats.stats import pearsonr
%matplotlib inline
seed = 1075
np.random.seed(seed)

#pandas #isin #filter

Filter column using isin Pandas

c3 = pd.Series(['China', 'US'])
df[df['countries'].isin(c1)]

#pandas #filter #column

Pandas filter dataframe columns

# applying filter function 
df.filter(["Name", "College", "Salary"])

#pandas #filter

Pandas data frame filter column names regex

# importing pandas as pd 
import pandas as pd 
  
# Creating the dataframe  
df = pd.read_csv("nba.csv") 
  
# Using regular expression to extract all 
# columns which has letter 'a' or 'A' in its name. 
df.filter(regex ='[aA]')

#python #pandas #format #currency

Style pandas value to currency

(df.groupby('name')['ext price']
 .agg(['mean', 'sum'])
 .style.format('${0:,.2f}'))

#python #pandas #formatting

Format a float as currency in Python

'${:,.2f}'.format(dfCombined['Amount'].sum())

#pandas #duplicates #drop

Pandas.drop_duplicates

df.drop_duplicates()

#python #pandas

How to reorder columns in a Pandas DataFrame in Python

df = df[["C", "A", "B"]]

#python #pandas #grouper

Pandas Grouper

df['column_name'] = pd.to_datetime(df['column_name'])
# new version
df.groupby(pd.Grouper(key='column_name', freq="M")).mean().plot()

#python #pandas #data-cleaning

Forward fills Column names

def ffill_cols(df, cols_to_fill_name='Unn'):
    """
    Forward fills column names. Propagate last valid column name forward to next invalid column. Works similarly to pandas
    ffill().
    
    :param df: pandas Dataframe; Dataframe
    :param cols_to_fill_name: str; The name of the columns you would like forward filled. Default is 'Unn' as
    the default name pandas gives unnamed columns is 'Unnamed'
    
    :returns: list; List of new column names
    """
    cols = df.columns.to_list()
    for i, j in enumerate(cols):
        if j.startswith(cols_to_fill_name):
            cols[i] = cols[i-1]
    return cols