Snippets Collections
from sys import path
import os
path.append(os.path.realpath('../'))
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import io
import pandas as pd

app = FastAPI()

@app.get("/get_csv")
async def get_csv():
    df = pd.DataFrame(dict(col1 = 1, col2 = 2), index=[0])
    stream = io.StringIO()
    df.to_csv(stream, index = False)
    response = StreamingResponse(iter([stream.getvalue()]),
                                 media_type="text/csv"
                                )
    response.headers["Content-Disposition"] = "attachment; filename=export.csv"
    return response
# Assume -9999 is a missing data flag
import numpy as np
df = df.replace(-9999, np.nan)
# Alternatively, flag the missing values when reading the file:
pd.read_csv("https://www.atmos.albany.edu/products/metarCSV/world_metar_latest.csv",
            sep=r'\s+', na_values=['9999.00', '-9999.0'])
pip install pandas-profiling

from pandas_profiling import ProfileReport
profile = ProfileReport(master_data, title="OMI v3.5 Profile Report")
profile.to_file("OMIv3.5_profilereport.html")
df['column'].unique()       # unique values
df['column'].nunique()      # number of unique values
df['column'].value_counts() # unique values together with their counts
#Post prepare_data_for_forecast
input_selected_items_df = pd.read_excel('item_list_2023-05-02.xlsx')
target_df = target_df[target_df['ITEM_ID'].isin(input_selected_items_df['ITEM_ID'])]
import pandas as pd

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None) 
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
df = pd.DataFrame()
for url in urls_list:
    df_new = pd.read_csv(url)  # read each URL into its own frame
    df = pd.concat([df, df_new], ignore_index=True)
from datetime import datetime

def parsed_df(**kwargs):
    # First define the format and then define the function
    timeFormat = "%Y-%m-%d %H:%M:%S UTC"
    # This function will iterate over each string in a 1-d array
    # and use Pandas' implementation of strptime to convert the string into a datetime object.
    parseTime = lambda x: datetime.strptime(x, timeFormat)
    return pd.read_csv('/path/to/file.csv', parse_dates=['time'], date_parser=parseTime, **kwargs).set_index('time')
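# A minimal sketch of the same idea on pandas >= 2.0, where date_parser is deprecated
# in favour of date_format (the file path and column name are placeholders):
import pandas as pd
df = pd.read_csv('/path/to/file.csv',
                 parse_dates=['time'],
                 date_format="%Y-%m-%d %H:%M:%S UTC").set_index('time')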
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# set the seaborn style and default font sizes
sns.set_style("whitegrid")
plt.rc('font', size=16) #controls default text size
plt.rc('axes', titlesize=16) #fontsize of the title
plt.rc('axes', labelsize=16) #fontsize of the x and y labels
plt.rc('xtick', labelsize=16) #fontsize of the x tick labels
plt.rc('ytick', labelsize=16) #fontsize of the y tick labels
plt.rc('legend', fontsize=16) #fontsize of the legend

# load dataset - census income
census_income = pd.read_csv(r'../input/income/train.csv')

# define figure
fig, (ax1, ax2) = plt.subplots(2)
fig.set_size_inches(18.5, 10.5)

# plot age histogram
age_count = census_income.groupby(by=["age"])["age"].count()
ax1.bar(age_count.index, age_count, color='black')
ax1.set_ylabel("Counts")
ax1.set_xlabel("Age")

# binning age
def age_bins(age):
    if age < 29:
        return "1 - young"
    elif age < 60:
        return "2 - middle-aged"
    else:
        return "3 - old-aged"

# apply trans. function
census_income["age_bins"] = census_income["age"].apply(age_bins)

# group and count all entries in the same bin
age_bins_df = census_income.groupby(by=["age_bins"])["age_bins"].count()

ax2.bar(age_bins_df.index, age_bins_df, color='grey')
ax2.set_ylabel("Counts")
ax2.set_xlabel("Age group")
import pandas as pd
from dataprep.eda import create_report
df = pd.read_csv("parking_violations.csv")
create_report(df)

#Plot
from dataprep.eda import plot, plot_correlation
plot(df)
plot_correlation(df)

#plot correlation between two specific columns
plot_correlation(df, "col1", "col2")



#load dataset houses, titanic
from dataprep.datasets import load_dataset

#plot diff between training and test data
from dataprep.datasets import load_dataset
from dataprep.eda import plot_diff
import numpy as np
df1 = load_dataset("house_prices_train")
df1 = df1.replace(" ?", np.nan)
df2 = load_dataset("house_prices_test")
df2 = df2.replace(" ?", np.nan)
plot_diff([df1, df2])

#save Report
from dataprep.eda import create_report
report = create_report(df, title='My Report')
report.save(filename='report_01', to='~/Desktop')
# encode gender as numeric labels (0/1) rather than one-hot dummies
data['Gender'] = data['Gender'].map({'Male':0, 'Female':1})
import statsmodels.formula.api as smf

model1 = smf.ols('review_score ~ wait_time', data=orders).fit()
print(model1.summary())
## MAP (for Series)
series.map(function)         # apply a function element-wise
series.map({'old': 'new'})   # map values through a dict
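# A tiny self-contained example of both map() styles (the data is made up):
import pandas as pd
s = pd.Series(['cat', 'dog', 'cat'])
s.map(str.upper)             # -> CAT, DOG, CAT
s.map({'cat': 0, 'dog': 1})  # -> 0, 1, 0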

#add a column to your df operating on each row
# Let's call this "custom_sum" as "sum" is a built-in function
def custom_sum(row):
    return row.sum()
df['D'] = df.apply(custom_sum, axis=1)

#add as row
df.loc['Row 5'] = df.apply(custom_sum, axis=0)

## APPLY (for DataFrame)
df.apply(lambda col: col.max(), axis=0)            # default axis: one column at a time
df.apply(lambda row: row['A'] + row['B'], axis=1)  # one row at a time
df.applymap(my_funct_for_indiv_elements)           # element-wise (DataFrame.map in pandas >= 2.1)
df.applymap(lambda x: '%.2f' % x)

## GROUPBY
group = df.groupby('col_A')
group.mean()
group.apply(np.mean)
group.agg({
    'col_A': ['mean', np.sum],
    'col_B': my_custom_sum,               # pass a custom function directly...
    'col_B': lambda s: my_custom_sum(s),  # ...or as a lambda (duplicate dict keys: only the last one is kept)
    })

group.apply(custom_mean_function)
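# A small self-contained groupby/agg sketch (column names and data are made up):
import pandas as pd
demo = pd.DataFrame({'col_A': ['x', 'x', 'y'], 'col_B': [1, 2, 3]})
demo.groupby('col_A').agg({'col_B': ['mean', 'sum']})
#       col_B
#        mean sum
# col_A
# x       1.5   3
# y       3.0   3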
df.groupby('name')['n'].sum().plot(kind='bar', x='name')
df['TIMESINCE'] = (pd.Timestamp.today() - df['REGDATE']).dt.days
df['YEARSSINCE'] = df['TIMESINCE'] / 365
df
# convert the 'Date' column to datetime format
df['Date']= pd.to_datetime(df['Date'])
 
# Check the format of 'Date' column
df.info()
df[df.columns[df.columns.str.contains(pat = 'WORDABC9')]]
   WORDABC9N123  WORDABC99N123
0            13             16
1            14             17
2            15             18

df[df.columns[df.columns.str.contains(pat = 'WORDABC9\\b')]]
Empty DataFrame
Columns: []
Index: [0, 1, 2]
import pandas as pd

excel_file = pd.read_excel('file.xlsx', sheet_name=None)  # sheet_name=None loads every sheet into a dict
dataset_combined = pd.concat(excel_file.values())
df['year'] = df['date'].dt.year  # use .dt.month for the month, etc.
df.head()
def find_boundaries(df, variable, distance=1.5):

    IQR = df[variable].quantile(0.75) - df[variable].quantile(0.25)

    lower_boundary = df[variable].quantile(0.25) - (IQR * distance)
    upper_boundary = df[variable].quantile(0.75) + (IQR * distance)

    return upper_boundary, lower_boundary
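# A minimal usage sketch of find_boundaries with made-up data:
import pandas as pd
demo = pd.DataFrame({'price': [10, 12, 11, 13, 95]})
upper, lower = find_boundaries(demo, 'price', distance=1.5)
outliers = demo[(demo['price'] > upper) | (demo['price'] < lower)]
print(outliers)  # the row with 95 is flagged as an outlier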
data.nunique().plot.bar(figsize=(12,6))
plt.ylabel('Number of unique categories')
plt.xlabel('Variables')
plt.title('Cardinality')

## Version with 5% threshold
# label_freq is assumed to be a Series of normalised category frequencies,
# e.g. label_freq = data['class'].value_counts(normalize=True)
fig = label_freq.sort_values(ascending=False).plot.bar()
fig.axhline(y=0.05, color='red')
fig.set_ylabel('percentage of cars within each category')
fig.set_xlabel('Variable: class')
fig.set_title('Identifying Rare Categories')
plt.show()
data.isnull().mean().plot.bar(figsize=(12,6))
plt.ylabel('Percentage of missing values')
plt.xlabel('Variables')
plt.title('Quantifying missing data')
# Get the Numerical Data list to infer distribution plots

numerical = [var for var in df.columns if df[var].dtype!='O'] 
print('There are {} numerical variables\n'.format(len(numerical))) 
print('The numerical variables are :', numerical)

# Get the Categorical Data list to infer distribution plots

categorical = [var for var in df.columns if df[var].dtype =='O'] 
print('There are {} Categorical variables\n'.format(len(categorical))) 
print('The Categorical variables are :', categorical)
#Add the new column which gives a unique number to each of these labels 

df['label_num'] = df['label'].map({
    'Household' : 0, 
    'Books': 1, 
    'Electronics': 2, 
    'Clothing & Accessories': 3
})

#checking the results 
df.head(5)
# Setup fake key in both DataFrames to join on it

df1['key'] = 0
df2['key'] = 0

df1.merge(df2, on='key', how='outer')
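# On pandas >= 1.2 the same cross join can be written without the fake key:
df1.merge(df2, how='cross')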
from pandas_profiling import ProfileReport
import pandas as pd

test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

profile = ProfileReport(train, title="train Pandas Profiling Report")
profile.to_file("train_report.html")
profile = ProfileReport(test, title="test Pandas Profiling Report")
profile.to_file("test_report.html")
#You can do it using GloVe library:

#Install it: 

!pip install glove_python

from glove import Corpus, Glove

#Creating a corpus object
corpus = Corpus() 

#Training the corpus to generate the co-occurrence matrix which is used in GloVe
corpus.fit(lines, window=10)

glove = Glove(no_components=5, learning_rate=0.05) 
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')
# for FastText
from gensim.models import FastText
from gensim.test.utils import common_texts  # some example sentences

print(common_texts[0])    # ['human', 'interface', 'computer']
print(len(common_texts))  # 9

model = FastText(vector_size=4, window=3, min_count=1)  # instantiate
model.build_vocab(sentences=common_texts)
model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)  # train

model2 = FastText(vector_size=4, window=3, min_count=1, sentences=common_texts, epochs=10)

import numpy as np
np.allclose(model.wv['computer'], model2.wv['computer'])  # True


from gensim.test.utils import datapath

corpus_file = datapath('lee_background.cor')  # absolute path to corpus
model3 = FastText(vector_size=4, window=3, min_count=1)
model3.build_vocab(corpus_file=corpus_file)  # scan over corpus to build the vocabulary

total_words = model3.corpus_total_words  # number of words in the corpus
model3.train(corpus_file=corpus_file, total_words=total_words, epochs=5)


from gensim.utils import tokenize
from gensim import utils

class MyIter:
    def __iter__(self):
        path = datapath('crime-and-punishment.txt')
        with utils.open(path, 'r', encoding='utf-8') as fin:
            for line in fin:
                yield list(tokenize(line))

model4 = FastText(vector_size=4, window=3, min_count=1)
model4.build_vocab(sentences=MyIter())
total_examples = model4.corpus_count
model4.train(sentences=MyIter(), total_examples=total_examples, epochs=5)

from gensim.test.utils import get_tmpfile

fname = get_tmpfile("fasttext.model")

model.save(fname)
model = FastText.load(fname)


# https://radimrehurek.com/gensim/models/fasttext.html
In [1]: df = pd.DataFrame( {'a':['A','A','B','B','B','C'], 'b':[1,2,5,5,4,6]})
        df

Out[1]: 
   a  b
0  A  1
1  A  2
2  B  5
3  B  5
4  B  4
5  C  6

In [2]: df.groupby('a')['b'].apply(list)
Out[2]: 
a
A       [1, 2]
B    [5, 5, 4]
C          [6]
Name: b, dtype: object

In [3]: df1 = df.groupby('a')['b'].apply(list).reset_index(name='new')
        df1
Out[3]: 
   a        new
0  A     [1, 2]
1  B  [5, 5, 4]
2  C        [6]
df2 = df1.filter(['Col 1', 'Col 2'], axis=1)
df2.to_csv('output.csv', index=False)  # do not write the index as a separate column
audio_meta["duration"].astype(np.int8).value_counts().rename_axis('duration').reset_index(name='counts')
import pandas as pd
from google.cloud import storage

BUCKET_NAME = 'zhibo-work'

# Create a Cloud Storage client to download the data
storage_client = storage.Client()

# Download the data
data_bucket = storage_client.bucket(BUCKET_NAME)
blob = data_bucket.blob('description/data_gics.csv')
blob.download_to_filename('data_gics.csv')
def load_table_to_df(fp, **kwargs):

    if fp.endswith('.xls'):
        result_df = pd.read_excel(fp, **kwargs)
        
    elif fp.endswith('.xlsx'):
        result_df = pd.read_excel(fp, engine='openpyxl', **kwargs)
        
    elif fp.endswith('.csv') or fp.endswith('.txt'):
        result_df = pd.read_csv(fp, **kwargs)
        
    elif fp.endswith('.parquet'):
        result_df = pd.read_parquet(fp, **kwargs)
        
    else:
        return "Wrong file extension"

    print('     -->  Succesfully loaded {}'.format(fp))
    return result_df
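# A hypothetical call to the loader above (the path and kwargs are placeholders):
df = load_table_to_df('data/my_table.xlsx', sheet_name=0)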
lins = pd.Series(lines, name='img_path')
train_df = pd.concat([train_df, lins], axis=1)
import numpy as np
import pandas as pd   
from IPython.display import display_html 

df1 = pd.DataFrame(np.arange(12).reshape((3,4)),columns=['A','B','C','D',])
df2 = pd.DataFrame(np.arange(16).reshape((4,4)),columns=['A','B','C','D',])

df1_styler = df1.style.set_table_attributes("style='display:inline'").set_caption('Caption table 1')
df2_styler = df2.style.set_table_attributes("style='display:inline'").set_caption('Caption table 2')

display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)
import pandas as pd
	
df_1 = pd.DataFrame(
	[['Somu', 68, 84, 78, 96],
	['Kiku', 74, 56, 88, 85],
	['Ajit', 77, 73, 82, 87]],
	columns=['name', 'physics', 'chemistry','algebra','calculus'])

df_2 = pd.DataFrame(
	[['Amol', 72, 67, 91, 83],
	['Lini', 78, 69, 87, 92]],
	columns=['name', 'physics', 'chemistry','science','calculus'])	

frames = [df_1, df_2]

#append dataframes (DataFrame.append was removed in pandas 2.0, so use pd.concat)
df = pd.concat(frames, ignore_index=True, sort=False)

#print dataframe
print("df_1\n------\n",df_1)
print("\ndf_2\n------\n",df_2)
print("\ndf\n--------\n",df)
# function to replace rows in the provided column of the provided dataframe
# that match the provided string above the provided ratio with the provided string
import fuzzywuzzy
from fuzzywuzzy import process, fuzz

def replace_matches_in_column(df, column, string_to_match, min_ratio=47):
    # get a list of unique strings
    strings = df[column].unique()
    
    # get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(string_to_match, strings, 
                                         limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # only keep matches with a ratio >= min_ratio
    close_matches = [matches[0] for matches in matches if matches[1] >= min_ratio]

    # get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # replace all rows with close matches with the input matches 
    df.loc[rows_with_matches, column] = string_to_match
    
    # let us know the function's done
    print("All done!")
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename)) 
import datetime as dt
df['SettlementDate'] = pd.TimedeltaIndex(df['SettlementDate'], unit='d') + dt.datetime(1900, 1, 1)
df['Tags'].str.split(expand=True).stack().value_counts()
def weighted_median(df, median_col, weight_col):
    df_sorted = df.sort_values(median_col)
    cumsum = df_sorted[weight_col].cumsum()
    cutoff = df_sorted[weight_col].sum()/2
    return df_sorted[cumsum >= cutoff][median_col].iloc[0]
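# A quick sanity check of weighted_median on made-up data:
import pandas as pd
demo = pd.DataFrame({'value': [1, 2, 3, 4], 'weight': [1, 1, 1, 10]})
print(weighted_median(demo, 'value', 'weight'))  # 4, since most of the weight sits on the last row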
df.groupby('cluster', group_keys=False).apply(lambda df: df.sample(1))
#--------------install pytorch geometric
!python -c "import torch; print(torch.version.cuda)"
!python -c "import torch; print(torch.__version__)"
# check above version and edit below accordingly

!pip install torch==1.9.0
!pip uninstall -y torch-scatter
!pip uninstall -y torch-sparse
!pip uninstall -y torch-cluster
!pip uninstall -y torch-geometric
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-geometric

#--------------mount drive-------------------
from google.colab import drive
drive.mount('/content/drive')
### File path
TRAIN_ID_PATH = '/content/drive/MyDrive/folder/pytorch/train.csv'
df.isnull().sum()

# check which rows have a null in a given column
df[df["Business Description"].isnull()]

is_NaN = df.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = df[row_has_NaN]

print(rows_with_NaN)
# label the first 2981 rows as train and the rest as test
# (use .loc to avoid chained assignment; slicing [:2981] then [2982:] would silently skip row 2981)
df.loc[df.index[:2981], 'line'] = 'train'
df.loc[df.index[2981:], 'line'] = 'test'

with open('job_post_01.txt', 'a') as f:
    dfAsString = df.to_string(header=False, index=True)
    f.write(dfAsString)  # the with-block closes the file automatically

data = pd.read_csv('job_post_01.csv')
df = data.sample(frac = 1).reset_index(drop = True)
doc_name_list = df.values.tolist()
doc_train_list = df.iloc[:2981].values.tolist()
doc_test_list = df.iloc[2981:].values.tolist()  # start at 2981 so the row between train and test is not dropped
# remove punctuation, segment with jieba and (optionally) remove stopwords
import re
import jieba

def punc_jieba(text, sep=' '):
#     stopword = stopwords(["zh"])
    text_punc = re.sub("[\s+\>\<\:\?\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()!,❤。~《》:()【】「」?”“;:、【】╮╯▽╰╭★→「」]+",
                       "", text)
    text_cut = sep.join(jieba.cut(text_punc, cut_all=False)).lower()
#     tokens = word_tokenize(text_cut)
#     clean_text = [word for word in tokens if not word in stopword]

    return text_cut
# method 1 (stopwords() is assumed to come from the stopwordsiso package)
from stopwordsiso import stopwords

def stop_word(text):
    stopword = stopwords(['zh'])
    # split first, otherwise iterating over a string yields single characters
    remove_stw = [word for word in text.split() if word not in stopword]
    return remove_stw
df['text'] = df['text'].apply(stop_word)
# method 2
stopword = stopwords(['zh'])
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopword]))
sudo pip install opencc
# if that does not work, clone the project and install it from source first

import pandas as pd
import numpy as np
# -*- coding: utf-8 -*-
import opencc
from opencc import OpenCC

df = pd.read_csv('training.csv').astype(str)

def tra_sim(text):
    cc = OpenCC('tw2s')
    sim = cc.convert(text)
    return sim
df['sim_label'] = df['label'].apply(tra_sim)
df['sim_detail_label'] = df['detail_label'].apply(tra_sim)
df['sim_text'] = df['text'].apply(tra_sim)
# https://vimsky.com/zh-tw/examples/detail/python-method-regex.sub.html

# Match text between two strings with regular expression
clean_text = re.search(r'before(.*?)after', text).group(1)
import re
s = 'Part 1. Part 2. Part 3 then more text'
re.search(r'Part 1\.(.*?)Part 3', s).group(1)  # ' Part 2. '
re.search(r'Part 1(.*?)Part 3', s).group(1)    # '. Part 2. '
import glob
all_filenames = glob.glob("/home/lynaza/Desktop/Quinn/lda/檢察官起訴書/*.txt")

# return only the file names (note: os.listdir lists every entry, not just .txt files)
import os
arr = os.listdir("/home/lynaza/Desktop/Quinn/lda/檢察官起訴書")
print(arr)



import cv2
import os
import glob

def load_images_name(path):
    
    list_1 = glob.glob(path+'/*.tif') # depth of 1 folder
    
    list_2 = glob.glob(path+'/*/*.tif') # depth of 2 folder
    
    list_3 = glob.glob(path+'/*/*/*.tif')  # depth of 3 folder
    
    list_4 = glob.glob(path+'/*/*/*/*.tif')  # depth of 4 folder
    
    images_path = list_1 +list_2 +list_3 + list_4

    return images_path

images = load_images_name("/home/lynaza/Desktop/traindata/test")
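# The four explicit depth levels above can also be covered with a single recursive glob
# (a sketch; '**' with recursive=True matches .tif files at any depth, including the top level):
def load_images_name_recursive(path):
    return glob.glob(path + '/**/*.tif', recursive=True)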
rmsval = df.loc[:, 'c1':'c4']
def getrms(row):  
  a = np.sqrt(sum(row**2/4))
  return a
df['rms'] = df.apply(getrms,axis=1)
df.head()
for c in df_drop.columns:
    df_drop[c] = df_drop[c].str.replace(r'[^\w\s]+', '', regex=True)
df_drop = df_drop.astype(str)
df_drop.head()
import pandas as pd

data = {'Product': ['Desktop Computer','Tablet','Printer','Laptop'],
        'Price': [850,200,150,1300]
        }

df = pd.DataFrame(data, columns= ['Product', 'Price'])

df.to_csv(r'Path where you want to store the exported CSV file\File Name.csv')
# df.to_csv('file_name.csv', encoding='utf-8', index=False)
print (df)

data[['column1','column2','column3',...]].to_csv('fileNameWhereYouwantToWrite.csv')
     
# collect rows in a plain list inside the loop, then build the DataFrame once at the end
# (DataFrame.append was removed in pandas 2.0)
rows = []
for i in range(n):  # n: however many iterations are needed
	#....
	rows.append(text)
df = pd.DataFrame(rows)
# best way
data['resume'] = data[['Resume_title', 'City', 'State', 'Description', 'work_experiences', 'Educations', 'Skills', 'Certificates', 'Additional Information']].agg(' '.join, axis=1)


# other way
df["period"] = df["Year"] + df["quarter"]
df['Period'] = df['Year'] + ' ' + df['Quarter']
df["period"] = df["Year"].astype(str) + df["quarter"] #If one (or both) of the columns are not string typed
#Beware of NaNs when doing this!
df['period'] = df[['Year', 'quarter', ...]].agg('-'.join, axis=1) #for multiple string columns
df['period'] = df[['Year', 'quarter']].apply(lambda x: ''.join(x), axis=1)
#method cat() of the .str accessor 
df['Period'] = df.Year.str.cat(df.Quarter)
df['Period'] = df.Year.astype(str).str.cat(df.Quarter.astype(str), sep='q')
df['AllTogether'] = df['Country'].str.cat(df[['State', 'City']], sep=' - ') #add parameter na_rep to replace the NaN values with a string if have nan
columns = ['whatever', 'columns', 'you', 'choose']
df['period'] = df[columns].astype(str).sum(axis=1)

#a function
def str_join(df, sep, *cols):
    from functools import reduce
    return reduce(lambda x, y: x.astype(str).str.cat(y.astype(str), sep=sep),
                  [df[col] for col in cols])

df['cat'] = str_join(df, '-', 'c0', 'c1', 'c2', 'c3')
import re

text = 'this is a text'

try:
    found = re.search('is(.+?)text', text).group(1)
except AttributeError:
    # pattern not found in the original string
    found = '0 wtitle' # apply your error handling
print(found)  # ' is a '
import pandas as pd, re

junk = """Shot - Wounded/Injured, Shot - Dead (murder, accidental, suicide), Suicide - Attempt, Murder/Suicide, Attempted Murder/Suicide (one variable unsuccessful), Institution/Group/Business, Mass Murder (4+ deceased victims excluding the subject/suspect/perpetrator , one location), Mass Shooting (4+ victims injured or killed excluding the subject/suspect"""

rx = re.compile(r'\([^()]+\)|,(\s+)')

data = [x 
        for nugget in rx.split(junk) if nugget
        for x in [nugget.strip()] if x]

df = pd.DataFrame({'incident_characteristics': data})
print(df)
def clean(txt):
    txt = txt.str.replace(r'(<br/>)', '', regex=True)
    txt = txt.str.replace(r'(<a).*(>).*(</a>)', '', regex=True)
    txt = txt.str.replace(r'(&amp)', '', regex=True)
    txt = txt.str.replace(r'(&gt)', '', regex=True)
    txt = txt.str.replace(r'(&lt)', '', regex=True)
    txt = txt.str.replace(r'(\xa0)', ' ', regex=True)
    return txt
df['xxx column'] = clean(df['xxx column'])
# corr is assumed to be a correlation matrix, e.g. corr = df.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={'shrink': .5})
for p in ax.patches:
    values= '{:.0f}'.format(p.get_height())
    x = p.get_x() + p.get_width()/2
    y = p.get_height()
    ax.annotate(values, (x, y),ha='center', va ='bottom', fontsize = 11)
g = sns.barplot(...)  # g can be any axes-level seaborn plot (it returns a matplotlib Axes)
ax = g
for p in ax.patches:
    ax.text(p.get_x() + p.get_width()/2., p.get_height(), '{0:.2f}'.format(p.get_height()), 
        fontsize=12, color='black', ha='center', va='bottom')
django-admin startproject mysite
 
python manage.py startapp myapp
import pandas as pd
import matplotlib.pyplot as plt

from pandas_profiling import ProfileReport
profile = ProfileReport(gabijos, title='Gabijos g.', html={'style':{'full_width':True}})
profile.to_file("gabijos.html")

df_query = df_query.assign(comments='NoComment')
# rows of df that do not appear in dff (rows compared as tuples of strings)
qq = df[~df.astype(str).apply(tuple, 1).isin(dff.astype(str).apply(tuple, 1))]
for p in ax.patches:
    values = '{:.0f}'.format(p.get_height())
    x = p.get_x() + p.get_width()/2
    y = p.get_height()
    ax.annotate(values, (x, y),ha='center', va ='bottom', fontsize = 10)
# split a comma-separated string column; the characters being replaced were garbled in the
# original snippet, so a full-width comma '，' is assumed here
.apply(lambda x: x.replace('，', ',').split(','))
# importing libraries 
from sklearn.ensemble import VotingClassifier ,BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score 
from numpy import mean,std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold,train_test_split
from sklearn.linear_model import LogisticRegression,RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot
from sklearn.datasets import load_wine,load_iris
from matplotlib.pyplot import figure
figure(num=2, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')
import xgboost as xgb
from sklearn.feature_selection import SelectKBest,f_regression
from sklearn.linear_model import LinearRegression,BayesianRidge,ElasticNet,Lasso,SGDRegressor,Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,RobustScaler,StandardScaler
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA,KernelPCA
from sklearn.ensemble import ExtraTreesRegressor,GradientBoostingRegressor,RandomForestRegressor,VotingClassifier
from sklearn.model_selection import cross_val_score,KFold,GridSearchCV,RandomizedSearchCV,StratifiedKFold,train_test_split
from sklearn.base import BaseEstimator,clone,TransformerMixin,RegressorMixin
from sklearn.svm import LinearSVR,SVR
#import xgboost 
from xgboost import XGBRegressor
#Import Pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import skew
from scipy.stats import pearsonr
%matplotlib inline
seed = 1075
np.random.seed(seed)
c3 = pd.Series(['China', 'US'])
df[df['countries'].isin(c3)]
# applying filter function 
df.filter(["Name", "College", "Salary"]) 

# importing pandas as pd 
import pandas as pd 
  
# Creating the dataframe  
df = pd.read_csv("nba.csv") 
  
# Using regular expression to extract all 
# columns which has letter 'a' or 'A' in its name. 
df.filter(regex ='[aA]') 
(df.groupby('name')['ext price']
 .agg(['mean', 'sum'])
 .style.format('${0:,.2f}'))
'${:,.2f}'.format(dfCombined['Amount'].sum())
df['column_name'] = pd.to_datetime(df['column_name'])
# new version
df.groupby(pd.Grouper(key='column_name', freq="M")).mean().plot()
def ffill_cols(df, cols_to_fill_name='Unn'):
    """
    Forward fills column names. Propagate last valid column name forward to next invalid column. Works similarly to pandas
    ffill().
    
    :param df: pandas Dataframe; Dataframe
    :param cols_to_fill_name: str; The name of the columns you would like forward filled. Default is 'Unn' as
    the default name pandas gives unnamed columns is 'Unnamed'
    
    :returns: list; List of new column names
    """
    cols = df.columns.to_list()
    for i, j in enumerate(cols):
        if j.startswith(cols_to_fill_name):
            cols[i] = cols[i-1]
    return cols
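# A small sketch of ffill_cols in action (the column names are made up):
import pandas as pd
demo = pd.DataFrame([[1, 2, 3]], columns=['Region', 'Unnamed: 1', 'Sales'])
demo.columns = ffill_cols(demo)
print(demo.columns.tolist())  # ['Region', 'Region', 'Sales']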