Snippets Collections
import pandas as pd

# Read every sheet of the workbook; sheet_name=None makes read_excel return
# a dict of {sheet name: DataFrame}.
excel_file = pd.read_excel('file.xlsx', sheet_name=None)
# Stack all sheets on top of each other into a single DataFrame.
dataset_combined = pd.concat(excel_file.values())
# Extract the year from the 'date' column; use .dt.month for the month, etc.
# NOTE(review): `df` is not created in this snippet -- presumably a frame with
# a datetime 'date' column; confirm against the caller.
df['year'] = df['date'].dt.year
def find_boundaries(df, variable, distance=1.5):
    """Return the inter-quartile-range limits for one column.

    The limits lie ``distance`` inter-quartile ranges above the third
    quartile and below the first quartile of ``df[variable]``.

    :param df: frame holding the column
    :param variable: name of the column to inspect
    :param distance: multiplier applied to the inter-quartile range
    :returns: tuple ``(upper_boundary, lower_boundary)`` -- upper limit first
    """
    column = df[variable]
    third_quartile = column.quantile(0.75)
    first_quartile = column.quantile(0.25)

    # Width of the margin added on either side of the quartiles.
    margin = (third_quartile - first_quartile) * distance

    return third_quartile + margin, first_quartile - margin
plt.ylabel('Number of unique categories')

## Version with 5% threshold

# Bar chart of the category frequencies, most frequent first. The original
# line assigned the sorted Series itself, which has no axhline/set_* methods;
# Series.plot.bar() returns the matplotlib Axes those calls need.
fig = label_freq.sort_values(ascending=False).plot.bar()
fig.axhline(y=0.05, color='red')  # horizontal marker at the 5% threshold
fig.set_ylabel('percentage of cars within each category')
fig.set_xlabel('Variable: class')
fig.set_title('Identifying Rare Categories')
plt.ylabel('Percentage of missing values')
plt.title('Quantifying missing data')
# Get the Numerical Data list to infer distribution plots

numerical = [var for var in df.columns if df[var].dtype!='O'] 
print('There are {} numerical variables\n'.format(len(numerical))) 
print('The numerical variables are :', numerical)

# Get the Categorical Data list to infer distribution plots

categorical = [var for var in df.columns if df[var].dtype =='O'] 
print('There are {} Categorical variables\n'.format(len(categorical))) 
print('The Categorical variables are :', categorical)
#Add the new column which gives a unique number to each of these labels 

# Encode each category label as an integer id. The original snippet never
# closed the dict literal or the map() call.
df['label_num'] = df['label'].map({
    'Household': 0,
    'Books': 1,
    'Electronics': 2,
    'Clothing & Accessories': 3,
})

#checking the results 
# Setup fake key in both DataFrames to join on it

df1['key'] = 0
df2['key'] = 0

df1.merge(df2, on='key', how='outer')
from pandas_profiling import ProfileReport
import pandas as pd

test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

profile = ProfileReport(train, title="train Pandas Profiling Report")
profile = ProfileReport(test, title="test Pandas Profiling Report")
#You can do it using GloVe library:

#Install it: 

!pip install glove_python

from glove import Corpus, Glove

# Create a corpus object.
corpus = Corpus()

# Fit the corpus to generate the co-occurrence matrix which is used in GloVe.
# NOTE(review): `lines` is not defined in this snippet -- presumably an
# iterable of tokenised sentences; supply it before running.
corpus.fit(lines, window=10)

# Build the model, then train it on the co-occurrence matrix.
glove = Glove(no_components=5, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)

 #for Fasttext
 from gensim.models import FastText
from gensim.test.utils import common_texts  # some example sentences
['human', 'interface', 'computer']
model = FastText(vector_size=4, window=3, min_count=1)  # instantiate
model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)  # train
model2 = FastText(vector_size=4, window=3, min_count=1, sentences=common_texts, epochs=10)

import numpy as np
np.allclose(model.wv['computer'], model2.wv['computer'])

from gensim.test.utils import datapath
corpus_file = datapath('lee_background.cor')  # absolute path to corpus
model3 = FastText(vector_size=4, window=3, min_count=1)
model3.build_vocab(corpus_file=corpus_file)  # scan over corpus to build the vocabulary
total_words = model3.corpus_total_words  # number of words in the corpus
model3.train(corpus_file=corpus_file, total_words=total_words, epochs=5)

from gensim.utils import tokenize
from gensim import utils
class MyIter:
    """Iterable over a sample text file, yielding one token list per line.

    Each call to ``__iter__`` reopens the file, so an instance can be
    iterated more than once.
    """

    def __iter__(self):
        path = datapath('crime-and-punishment.txt')
        # The original line had lost its `utils.open(path` call.
        with utils.open(path, 'r', encoding='utf-8') as fin:
            for line in fin:
                yield list(tokenize(line))
model4 = FastText(vector_size=4, window=3, min_count=1)
total_examples = model4.corpus_count
model4.train(sentences=MyIter(), total_examples=total_examples, epochs=5)
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("fasttext.model")
model = FastText.load(fname)

In [1]: df = pd.DataFrame( {'a':['A','A','B','B','B','C'], 'b':[1,2,5,5,4,6]})

   a  b
0  A  1
1  A  2
2  B  5
3  B  5
4  B  4
5  C  6

In [2]: df.groupby('a')['b'].apply(list)
A       [1, 2]
B    [5, 5, 4]
C          [6]
Name: b, dtype: object

In [3]: df1 = df.groupby('a')['b'].apply(list).reset_index(name='new')
   a        new
0  A     [1, 2]
1  B  [5, 5, 4]
2  C        [6]
# Keep only the listed columns.
df2 = df1.filter(['Col 1', 'Col 2'], axis=1)
# DataFrame has no write_csv(); to_csv() is the CSV writer.
df2.to_csv('output.csv', index=False)  # index=False: do not write the row index as a separate column
import pandas as pd
# The package name had been lost from this import; storage.Client() below is
# the google-cloud-storage client.
from google.cloud import storage

BUCKET_NAME = 'zhibo-work'

# Create a Cloud Storage client to download the data
storage_client = storage.Client()

# Get handles on the bucket and on the CSV object inside it
data_bucket = storage_client.bucket(BUCKET_NAME)
blob = data_bucket.blob('description/data_gics.csv')
def load_table_to_df(fp, **kwargs):
    """Load a tabular file into a DataFrame, choosing the reader by extension.

    :param fp: str; path of the file. ``.xls``, ``.xlsx``, ``.csv``, ``.txt``
        and ``.parquet`` are recognised (the match is case-sensitive).
    :param kwargs: forwarded unchanged to the pandas reader that is used.
    :returns: the loaded DataFrame, or the string ``"Wrong file extension"``
        when the extension is not recognised.
    """
    if fp.endswith('.xls'):
        result_df = pd.read_excel(fp, **kwargs)
    elif fp.endswith('.xlsx'):
        result_df = pd.read_excel(fp, engine='openpyxl', **kwargs)
    elif fp.endswith(('.csv', '.txt')):
        result_df = pd.read_csv(fp, **kwargs)
    elif fp.endswith('.parquet'):
        result_df = pd.read_parquet(fp, **kwargs)
    else:
        # This return used to sit inside the parquet branch, so parquet files
        # always produced the error string and unknown extensions fell
        # through to an unbound `result_df`.
        return "Wrong file extension"

    print('     -->  Succesfully loaded {}'.format(fp))
    return result_df
lins = pd.Series(lines, name='img_path')
train_df = pd.concat([train_df, lins], axis=1)
import numpy as np
import pandas as pd   
from IPython.display import display_html 

df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=['A','B','C','D',])
df2 = pd.DataFrame(np.arange(16).reshape((4,4)),columns=['A','B','C','D',])

# Give each table an inline display style so both render on one row, and a
# caption. The `dfN.style.set_table_attributes(` call had been lost.
df1_styler = df1.style.set_table_attributes("style='display:inline'").set_caption('Caption table 1')
df2_styler = df2.style.set_table_attributes("style='display:inline'").set_caption('Caption table 2')

display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)
import pandas as pd
df_1 = pd.DataFrame(
    [['Somu', 68, 84, 78, 96],
     ['Kiku', 74, 56, 88, 85],
     ['Ajit', 77, 73, 82, 87]],
    columns=['name', 'physics', 'chemistry', 'algebra', 'calculus'])

df_2 = pd.DataFrame(
    [['Amol', 72, 67, 91, 83],
     ['Lini', 78, 69, 87, 92]],
    columns=['name', 'physics', 'chemistry', 'science', 'calculus'])

frames = [df_1, df_2]

# Stack the frames row-wise. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported equivalent. Columns present in only one frame
# ('algebra', 'science') are filled with NaN for the other frame's rows.
df = pd.concat(frames, ignore_index=True, sort=False)

#print dataframe
# function to replace rows in the provided column of the provided dataframe
# that match the provided string above the provided ratio with the provided string
def replace_matches_in_column(df, column, string_to_match, min_ratio = 47):
    """Overwrite near-matches of ``string_to_match`` in ``df[column]``, in place.

    The unique values of the column are scored against ``string_to_match``
    with fuzzywuzzy's ``token_sort_ratio``; of the ten best-scoring values,
    every one scoring at least ``min_ratio`` is replaced by
    ``string_to_match`` in ``df``. Prints "All done!" when finished and
    returns nothing.
    """
    # Candidate spellings: every distinct value in the column.
    candidates = df[column].unique()

    # Ten best-scoring candidates; each entry is indexed as [0] value, [1] score.
    scored = fuzzywuzzy.process.extract(string_to_match, candidates,
                                        limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # Keep only the values whose score reaches the threshold.
    close_matches = [hit[0] for hit in scored if hit[1] >= min_ratio]

    # Rows whose current value is one of the accepted near-matches.
    is_close = df[column].isin(close_matches)

    # Normalise those rows to the canonical string.
    df.loc[is_close, column] = string_to_match

    print("All done!")
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename)) 
df['SettlementDate'] = pd.TimedeltaIndex(df['SettlementDate'], unit='d') + dt.datetime(1900,1,1)
def weighted_median(df, median_col, weight_col):
    """Return the weighted median of ``median_col`` using ``weight_col`` weights.

    Rows are ordered by ``median_col``; the result is the ``median_col`` value
    of the first row at which the running total of ``weight_col`` reaches
    half of the overall weight.
    """
    ordered = df.sort_values(median_col)
    weights = ordered[weight_col]
    running_weight = weights.cumsum()
    half_weight = weights.sum() / 2
    reached_half = running_weight >= half_weight
    return ordered.loc[reached_half, median_col].iloc[0]
df.groupby('cluster', group_keys=False).apply(lambda df: df.sample(1))
#--------------install pytorch geometric
!python -c "import torch; print(torch.version.cuda)"
!python -c "import torch; print(torch.__version__)"
# check above version and edit below accordingly

!pip install torch==1.9.0
!pip uninstall -y torch-scatter
!pip uninstall -y torch-sparse
!pip uninstall -y torch-cluster
!pip uninstall -y torch-geometric
!pip install torch-scatter -f
!pip install torch-sparse -f
!pip install torch-cluster -f
!pip install torch-spline-conv -f
!pip install torch-geometric

#--------------mount drive-------------------
from google.colab import drive
### File path
TRAIN_ID_PATH = '/content/drive/MyDrive/folder/pytorch/train.csv'

#check where is null in a columns
df[df["Business Description"].isnull() == True]

is_NaN = df.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = df[row_has_NaN]

# Tag the first 2981 rows as 'train' and every remaining row as 'test'.
# The original used chained indexing (df['line'].iloc[...] = ...), which may
# assign into a temporary copy, and started the test slice at 2982, leaving
# row 2981 untagged.
line_col = df.columns.get_loc('line')
df.iloc[:2981, line_col] = 'train'
df.iloc[2981:, line_col] = 'test'

# Append the frame, rendered as plain text without a header row, to the file.
with open('job_post_01.txt', 'a') as f:
    dfAsString = df.to_string(header=False, index=True)
    # The original opened the file but never wrote the rendered text to it.
    f.write(dfAsString)


data = pd.read_csv('job_post_01.csv')
# Shuffle all rows (frac=1 samples the whole frame) and renumber the index.
df = data.sample(frac = 1).reset_index(drop = True)
doc_name_list = df.values.tolist()
# First 2981 rows for training, the rest for testing. The test slice used to
# start at 2982, silently dropping row 2981 from both lists.
doc_train_list = df.iloc[:2981].values.tolist()
doc_test_list = df.iloc[2981:].values.tolist()
# remove punc, segment and stopword
def punc_jieba(text, sep = ' '):
    """Strip punctuation from ``text``, segment it with jieba and lower-case it.

    :param text: str; raw input text
    :param sep: str; separator placed between the segmented tokens
    :returns: str; the tokens joined by ``sep``, lower-cased

    Stop-word removal is intentionally not done here.
    """
    # The original re.sub( call was cut off after the pattern; the
    # replacement ("") and subject (text) arguments are restored here.
    # A raw string keeps the regex escapes intact, and the former
    # .encode().decode("utf8") round-trip was a no-op.
    text_punc = re.sub(
        r"[\s+\>\<\:\?\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()!,❤。~《》:()【】「」?”“;:、【】╮╯▽╰╭★→「」]+",
        "",
        text,
    )
    text_cut = sep.join(jieba.cut(text_punc, cut_all=False)).lower()
    return text_cut
# mothod1
def stop_word(text):
    """Return the elements of ``text`` that are not in the 'zh' stop-word set.

    ``text`` is iterated element by element, so pass a sequence of tokens.
    The stop-word collection is fetched on every call.
    """
    zh_stopwords = stopwords(['zh'])
    return [token for token in text if token not in zh_stopwords]
df['text'] = df['text'].apply(stop_word)
# mothod2
stopword = stopwords(['zh'])
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopword)]))
sudo pip install opencc
# if nt work, should clone project first

import pandas as pd
import numpy as np
# -*- coding: utf-8 -*-
import opencc
from opencc import OpenCC

df = pd.read_csv('training.csv').astype(str)

def tra_sim(text):
    """Convert ``text`` with OpenCC's 'tw2s' configuration and return the result.

    A new converter is created on every call.
    """
    converter = OpenCC('tw2s')
    return converter.convert(text)
df['sim_label'] = df['label'].apply(tra_sim)
df['sim_detail_label'] = df['detail_label'].apply(tra_sim)
df['sim_text'] = df['text'].apply(tra_sim)
all_filenames = glob.glob("/home/lynaza/Desktop/Quinn/lda/檢察官起訴書/*.txt")

#return only filename (may contain not only duoi file)
 import os
 arr = os.listdir("/home/lynaza/Desktop/Quinn/lda/檢察官起訴書")

import cv2
import os
import glob

def load_images_name(path):
    """Collect the ``.tif`` paths found one to four levels below ``path``.

    Level one is ``path/*.tif``, level four is ``path/*/*/*/*.tif``. The
    returned list holds the shallowest matches first.
    """
    images_path = []
    for depth in range(1, 5):
        # depth 1 -> path/*.tif, depth 2 -> path/*/*.tif, and so on.
        images_path += glob.glob(path + '/*' * depth + '.tif')

    return images_path

images = load_images_name("/home/lynaza/Desktop/traindata/test")
rmsval = df.loc[:, 'c1':'c4']
def getrms(row):
    """Return the root mean square of the values in ``row``.

    ``row`` must support ``** 2`` element-wise and ``len()`` (for example a
    NumPy array or a pandas Series). The divisor used to be hard-coded to 4;
    it is now the number of values, so 4-element rows give the same result
    as before and other lengths give a true RMS.
    """
    count = len(row)
    if count == 0:
        # Keep the previous result (0.0) for empty input and avoid 0/0.
        return np.sqrt(0.0)
    return np.sqrt(sum(row ** 2) / count)
df['rms'] = df.apply(getrms,axis=1)
# Remove every run of characters that is neither a word character nor
# whitespace from each column.
for c in df_drop.columns:
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would leave the data unchanged.
    df_drop[c] = df_drop[c].str.replace(r'[^\w\s]+', '', regex=True)
df_drop = df_drop.astype(str)
import pandas as pd

# Sample product data; the dict literal was missing its closing brace.
data = {'Product': ['Desktop Computer', 'Tablet', 'Printer', 'Laptop'],
        'Price': [850, 200, 150, 1300]}

df = pd.DataFrame(data, columns=['Product', 'Price'])

df.to_csv(r'Path where you want to store the exported CSV file\File Name.csv')
# df.to_csv('file_name.csv', encoding='utf-8', index=False)
print (df)

df = pd.DataFrame()
for i in range():
# best way
data['resume'] = data[['Resume_title', 'City', 'State', 'Description', 'work_experiences', 'Educations', 'Skills', 'Certificates', 'Additional Information']].agg(' '.join, axis=1)

# other way
df["period"] = df["Year"] + df["quarter"]
df['Period'] = df['Year'] + ' ' + df['Quarter']
df["period"] = df["Year"].astype(str) + df["quarter"] #If one (or both) of the columns are not string typed
#Beware of NaNs when doing this!
df['period'] = df[['Year', 'quarter', ...]].agg('-'.join, axis=1) #for multiple string columns
df['period'] = df[['Year', 'quarter']].apply(lambda x: ''.join(x), axis=1)
#method cat() of the .str accessor 
# The .str.cat( calls had been lost from these three examples.
df['Period'] = df.Year.str.cat(df.Quarter)
df['Period'] = df.Year.astype(str).str.cat(df.Quarter.astype(str), sep='q')
df['AllTogether'] = df['Country'].str.cat(df[['State', 'City']], sep=' - ') #add parameter na_rep to replace the NaN values with a string if have nan
columns = ['whatever', 'columns', 'you', 'choose']
df['period'] = df[columns].astype(str).sum(axis=1)

#a function
def str_join(df, sep, *cols):
    """Join several columns of ``df`` row-wise into one Series of strings.

    :param df: pandas DataFrame holding the columns
    :param sep: str; separator placed between the column values
    :param cols: names of the columns to join, in order (at least one)
    :returns: pandas Series; for two or more columns, each value is the
        string form of the row's column values joined by ``sep``
    """
    from functools import reduce
    # Fold the columns left to right, concatenating each pair as strings.
    # The IPython '...:' prompts and the lost `.str.cat(y.astype(str)` call
    # made the original unparsable.
    return reduce(lambda x, y: x.astype(str).str.cat(y.astype(str), sep=sep),
                  [df[col] for col in cols])

In [4]: df['cat'] = str_join(df, '-', 'c0', 'c1', 'c2', 'c3')
# Remove every run of characters that is neither a word character nor
# whitespace from each column.
for c in df_drop.columns:
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would leave the data unchanged.
    df_drop[c] = df_drop[c].str.replace(r'[^\w\s]+', '', regex=True)
df_drop = df_drop.astype(str)
rmsval = df.loc[:, 'c1':'c4']
def getrms(row):
    """Return the root mean square of the values in ``row``.

    ``row`` must support ``** 2`` element-wise and ``len()`` (for example a
    NumPy array or a pandas Series). The divisor used to be hard-coded to 4;
    it is now the number of values, so 4-element rows give the same result
    as before and other lengths give a true RMS.
    """
    count = len(row)
    if count == 0:
        # Keep the previous result (0.0) for empty input and avoid 0/0.
        return np.sqrt(0.0)
    return np.sqrt(sum(row ** 2) / count)
df['rms'] = df.apply(getrms,axis=1)
import pandas as pd

# Sample product data; the dict literal was missing its closing brace.
data = {'Product': ['Desktop Computer', 'Tablet', 'Printer', 'Laptop'],
        'Price': [850, 200, 150, 1300]}

df = pd.DataFrame(data, columns=['Product', 'Price'])

df.to_csv(r'Path where you want to store the exported CSV file\File Name.csv')

print (df)
import re

text = 'this is a text'

# Capture whatever lies between the first 'is' and the following 'text'.
# The `try:` line and the `re.search(` call had been lost.
try:
    found = re.search('is(.+?)text', text).group(1)
except AttributeError:
    # re.search returned None: the markers were not found in the string
    found = '0 wtitle' # apply your error handling

# For the sample text above, found == ' is a ' (the first 'is' is the one
# inside 'this').
import pandas as pd, re

junk = """Shot - Wounded/Injured, Shot - Dead (murder, accidental, suicide), Suicide - Attempt, Murder/Suicide, Attempted Murder/Suicide (one variable unsuccessful), Institution/Group/Business, Mass Murder (4+ deceased victims excluding the subject/suspect/perpetrator , one location), Mass Shooting (4+ victims injured or killed excluding the subject/suspect"""

rx = re.compile(r'\([^()]+\)|,(\s+)')

data = [x 
        for nugget in rx.split(junk) if nugget
        for x in [nugget.strip()] if x]

df = pd.DataFrame({'incident_characteristics': data})
def clean(txt):
    """Strip leftover HTML fragments from a pandas Series of strings.

    Removes ``<br/>`` tags, anything from an opening ``<a`` to a closing
    ``</a>``, the entity prefixes ``&amp``, ``&gt`` and ``&lt`` (a trailing
    ``;`` is left in place, as before), and turns non-breaking spaces into
    ordinary spaces.

    :param txt: pandas Series of strings
    :returns: pandas Series with the replacements applied
    """
    # (pattern, replacement) pairs, applied in this order.
    replacements = [
        ("(<br/>)", ""),
        ('(<a).*(>).*(</a>)', ''),
        ('(&amp)', ''),
        ('(&gt)', ''),
        ('(&lt)', ''),
        ('(\xa0)', ' '),
    ]
    for pattern, replacement in replacements:
        # The patterns are regular expressions, so regex=True must be
        # explicit: pandas >= 2.0 treats the pattern as a literal otherwise
        # and none of the parenthesised patterns would ever match.
        txt = txt.str.replace(pattern, replacement, regex=True)
    return txt
df['xxx column'] = clean(df['xxx column'])

f ,ax = plt.subplots(figsize=(11,9))
cmap=sns.diverging_palette(230,20, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
           square=True, linewidth=.5, cbar_kws={'shrink':.5})
for p in ax.patches:
    values= '{:.0f}'.format(p.get_height())
    x = p.get_x() + p.get_width()/2
    y = p.get_height()
    ax.annotate(values, (x, y),ha='center', va ='bottom', fontsize = 11)
for p in ax.patches:
    values= '{:.0f}'.format(p.get_height())
    x = p.get_x() + p.get_width()/2
    y = p.get_height()
    ax.annotate(values, (x, y),ha='center', va ='bottom', fontsize = 11)
g = sns.*plot 
ax = g 
for p in ax.patches:
    ax.text(p.get_x() + p.get_width()/2., p.get_height(), '{0:.2f}'.format(p.get_height()), 
        fontsize=12, color='black', ha='center', va='bottom')
django-admin startproject mysite
python manage.py startapp myapp
import pandas as pd
import matplotlib.pyplot as plt

from pandas_profiling import ProfileReport
profile = ProfileReport(gabijos, title='Gabijos g.', html={'style':{'full_width':True}})

df_query = df_query.assign(comments='NoComment')
qq= dff[~df.astype(str).apply(tuple, 1).isin(dff.astype(str).apply(tuple, 1))]
for p in ax.patches:
    values = '{:.0f}'.format(p.get_height())
    x = p.get_x() + p.get_width()/2
    y = p.get_height()
    ax.annotate(values, (x, y),ha='center', va ='bottom', fontsize = 10)
.apply(lambda x: x.replace(',',',').replace(',',',').split(',')
# importing libraries 
from sklearn.ensemble import VotingClassifier ,BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score 
from numpy import mean,std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold,train_test_split
from sklearn.linear_model import LogisticRegression,RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot
from sklearn.datasets import load_wine,load_iris
from matplotlib.pyplot import figure
figure(num=2, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')
import xgboost as xgb
from sklearn.feature_selection import SelectKBest,f_regression
from sklearn.linear_model import LinearRegression,BayesianRidge,ElasticNet,Lasso,SGDRegressor,Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,RobustScaler,StandardScaler
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA,KernelPCA
from sklearn.ensemble import ExtraTreesRegressor,GradientBoostingRegressor,RandomForestRegressor,VotingClassifier
from sklearn.model_selection import cross_val_score,KFold,GridSearchCV,RandomizedSearchCV,StratifiedKFold,train_test_split
from sklearn.base import BaseEstimator,clone,TransformerMixin,RegressorMixin
from sklearn.svm import LinearSVR,SVR
#import xgboost 
from xgboost import XGBRegressor
#Import Pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import warnings
from scipy.stats import skew
from scipy.stats.stats import pearsonr
%matplotlib inline
seed = 1075
c3 = pd.Series(['China', 'US'])
# applying filter function 
df.filter(["Name", "College", "Salary"]) 

# importing pandas as pd 
import pandas as pd 
# Creating the dataframe  
df = pd.read_csv("nba.csv") 
# Using regular expression to extract all 
# columns which has letter 'a' or 'A' in its name. 
df.filter(regex ='[aA]') 
# Mean and sum of 'ext price' per name; the wrapping parenthesis was never
# closed in the original.
(df.groupby('name')['ext price']
 .agg(['mean', 'sum']))
df['column_name'] = pd.to_datetime(df['column_name'])
# new version
df.groupby(pd.Grouper(key='column_name', freq="M")).mean().plot()
def ffill_cols(df, cols_to_fill_name='Unn'):
    """
    Forward fills column names. Propagate last valid column name forward to next invalid column. Works similarly to pandas
    ffill().

    :param df: pandas Dataframe; Dataframe
    :param cols_to_fill_name: str; The name of the columns you would like forward filled. Default is 'Unn' as
    the default name pandas gives unnamed columns is 'Unnamed'
    :returns: list; List of new column names
    """
    cols = df.columns.to_list()
    for i, j in enumerate(cols):
        # i > 0: the first column has no predecessor to copy from; without
        # this guard cols[i-1] wrapped around to the LAST column name.
        # isinstance: non-string labels (e.g. ints) cannot match the prefix
        # and used to raise AttributeError on .startswith.
        if i > 0 and isinstance(j, str) and j.startswith(cols_to_fill_name):
            cols[i] = cols[i-1]
    return cols

Mon Aug 29 2022 12:59:46 GMT+0000 (UTC)

#python #pandas #dataframe #crossjoin

Thu Apr 21 2022 17:21:57 GMT+0000 (UTC)

#pandas #list #group

Wed Apr 20 2022 18:10:53 GMT+0000 (UTC)

#pandas #list #group

Thu Jan 20 2022 00:01:12 GMT+0000 (UTC)

#pandas #gcp

Thu Jan 06 2022 20:16:40 GMT+0000 (UTC)

#python #pandas #dataframe

Wed Oct 27 2021 06:23:58 GMT+0000 (UTC)

#python #pandas

Wed Oct 20 2021 01:52:13 GMT+0000 (UTC)

#python #pandas

Mon Oct 18 2021 02:42:56 GMT+0000 (UTC)


Fri Oct 15 2021 03:50:43 GMT+0000 (UTC)

#python #pandas

Thu Oct 07 2021 01:52:40 GMT+0000 (UTC)

#python #pandas

Wed Sep 01 2021 03:55:45 GMT+0000 (UTC)

#pandas #nlp #colab

Mon Aug 30 2021 17:59:16 GMT+0000 (UTC)

#pandas #nlp

Sun Aug 29 2021 18:15:41 GMT+0000 (UTC)

#pandas #nlp

Tue Jul 20 2021 04:01:34 GMT+0000 (UTC)

#python #pandas

Tue Jul 20 2021 02:52:15 GMT+0000 (UTC)

#python #pandas

Tue Jun 29 2021 16:32:19 GMT+0000 (UTC)

#py #dataframe #pandas

Tue Jun 29 2021 16:21:45 GMT+0000 (UTC)

#py #dataframe #pandas

Mon Jun 28 2021 17:29:44 GMT+0000 (UTC)

#py #dataframe #pandas

Wed May 26 2021 04:07:55 GMT+0000 (UTC)

#python #pandas

Thu May 20 2021 05:25:00 GMT+0000 (UTC)

#python #pandas

Mon May 17 2021 04:21:23 GMT+0000 (UTC)

#python #pandas

Mon May 17 2021 04:04:24 GMT+0000 (UTC)

#python #pandas

Thu May 13 2021 06:52:35 GMT+0000 (UTC)

#python #pandas

Wed Apr 07 2021 16:50:10 GMT+0000 (UTC)

#undefined #python #pandas

Wed Apr 07 2021 16:40:56 GMT+0000 (UTC)

#undefined #python #pandas

Thu Mar 25 2021 06:54:18 GMT+0000 (UTC)

#python #pandas

Wed Mar 17 2021 07:23:00 GMT+0000 (UTC)

#python #pandas

Thu Mar 11 2021 09:50:49 GMT+0000 (UTC)

#python #pandas

Sat Oct 31 2020 00:55:40 GMT+0000 (UTC)

#pandas #isin #filter

Sat Oct 31 2020 00:40:59 GMT+0000 (UTC)

#pandas #filter #column

Sat Oct 31 2020 00:38:58 GMT+0000 (UTC)

#pandas #filter

Mon Oct 26 2020 01:01:58 GMT+0000 (UTC)

#python #pandas #format #currency

Mon Oct 26 2020 00:28:49 GMT+0000 (UTC)

#python #pandas #formatting

Fri Oct 23 2020 04:54:30 GMT+0000 (UTC)

#pandas #duplicates #drop

Tue Oct 20 2020 09:28:55 GMT+0000 (UTC)

#python #pandas

Fri Oct 16 2020 22:26:07 GMT+0000 (UTC)

#python #pandas #grouper

Thu Aug 06 2020 08:57:00 GMT+0000 (UTC)

#python #pandas #data-cleaning

Save snippets that work with our extensions

Available in the Chrome Web Store Get Firefox Add-on Get VS Code extension