thiscodeWorks - Organizing the best of code online

df['column'].unique()  # Array of the distinct values in the column
df['column'].nunique() # Number of distinct values (NaN is not counted by default)
df['column'].value_counts() # Each distinct value together with how many times it occurs

#pandas #dataframe

Concatenate DataFrames in a for loop

# Gather one DataFrame per URL and concatenate them in a single call.
# Calling pd.concat inside the loop re-copies every row collected so far on
# each iteration, and concatenating onto an empty frame can disturb dtypes.
frames = []
for url in urls_list:
    # NOTE: df_new is not defined in this snippet; build it from `url`
    # (e.g. by downloading/parsing the page) before appending it.
    frames.append(df_new)

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames) if frames else pd.DataFrame()

#python #pandas #dataframe #crossjoin

Cross Join in Pandas

# Give both frames the same constant column so that every row of one frame
# matches every row of the other when merging on it.
for frame in (df1, df2):
    frame['key'] = 0

# Merging on the constant key yields the Cartesian product of the rows
# (the helper 'key' column stays in both inputs and in the result).
df1.merge(right=df2, how='outer', on='key')

#python #pandas #dataframe

Source data to pandas DataFrame

def load_table_to_df(fp, **kwargs):
    """Load a tabular file into a pandas DataFrame, picking the reader by extension.

    Supported extensions (matched case-insensitively): ``.xls`` and ``.xlsx``
    (``pd.read_excel``; ``.xlsx`` uses the ``openpyxl`` engine), ``.csv`` and
    ``.txt`` (``pd.read_csv``), and ``.parquet`` (``pd.read_parquet``).

    Args:
        fp: Path to the file, as a string or any ``os.PathLike`` object.
        **kwargs: Passed through unchanged to the selected pandas reader.

    Returns:
        The loaded DataFrame. For an unsupported extension the string
        ``"Wrong file extension"`` is returned instead of a DataFrame; this
        sentinel is kept for backward compatibility, so callers must check
        for it before using the result.
    """
    import os

    # Accept pathlib.Path and friends; str paths pass through unchanged.
    path = os.fspath(fp)
    # Compare on a lower-cased copy so 'DATA.CSV' is treated like 'data.csv'.
    lowered = path.lower()

    if lowered.endswith('.xls'):
        result_df = pd.read_excel(path, **kwargs)

    elif lowered.endswith('.xlsx'):
        result_df = pd.read_excel(path, engine='openpyxl', **kwargs)

    elif lowered.endswith(('.csv', '.txt')):
        result_df = pd.read_csv(path, **kwargs)

    elif lowered.endswith('.parquet'):
        result_df = pd.read_parquet(path, **kwargs)

    else:
        return "Wrong file extension"

    print('     -->  Succesfully loaded {}'.format(fp))
    return result_df

#python #pandas #dataframe

Adding new column to a dataframe

# Wrap the raw values in a named Series so the new column is called 'img_path'.
lins = pd.Series(data=lines, name='img_path')
# Attach it as an extra column; pd.concat lines rows up by index label.
train_df = pd.concat(objs=[train_df, lins], axis='columns')

##dictionary ##pandas #defining_data_frame #csv #dataframe #square #bracket'

loc and iloc - part 3

# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)

# NOTE(review): the positions below assume the column order
# cars_per_cap (0), country (1), drives_right (2), which is what the
# [0, 2] selection at the end of this snippet implies - confirm against cars.csv.

# Print out drives_right column as Series
print(cars.loc[:, 'drives_right'])
print(cars.iloc[:, 2])  # a bare label/position returns a Series

# Print out drives_right column as DataFrame
print(cars.loc[:, ['drives_right']])
print(cars.iloc[:, [2]])  # wrapping the selector in a list returns a DataFrame


# Print out cars_per_cap and drives_right as DataFrame
print(cars.loc[:, ['cars_per_cap', 'drives_right']])
print(cars.iloc[:, [0, 2]])  # same two columns, selected by position

##dictionary ##pandas #defining_data_frame #csv #dataframe #square #bracket'

loc and iloc - part 2

# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)

# Print out drives_right value of Morocco
# Row label and column label go in ONE pair of brackets, separated by a comma.
# A nested list (cars.loc[['MOR', 'drives_right']]) would be read as two row
# labels and fail because 'drives_right' is a column, not a row.
print(cars.loc['MOR', 'drives_right'])

# Print sub-DataFrame (rows at positions 4 and 5, columns at positions 1 and 2)
print(cars.iloc[[4, 5], [1, 2]])

##dictionary ##pandas #defining_data_frame #csv #dataframe #square #bracket'

loc and iloc in Pandas

# Load the cars table, using the first CSV column as the row index
import pandas as pd
cars = pd.read_csv('cars.csv', index_col=0)

# Single observation (Japan): once by row label, once by integer position
japan_by_label = cars.loc['JPN']
print(japan_by_label)
japan_by_position = cars.iloc[2]
print(japan_by_position)

# Several observations (Australia and Egypt): pass a list of labels/positions
wanted_labels = ['AUS', 'EG']
print(cars.loc[wanted_labels])
wanted_positions = [1, 6]
print(cars.iloc[wanted_positions])

##dictionary ##pandas #defining_data_frame #csv #dataframe #square #bracket'

Square Bracket in Pandas

# Load the cars table, using the first CSV column as the row index
import pandas as pd
cars = pd.read_csv('cars.csv', index_col=0)

# Single brackets with one label -> pandas Series
country_series = cars['country']
print(country_series)

# Double brackets (a list of labels) -> pandas DataFrame, even for one column
country_frame = cars[['country']]
print(country_frame)


# A list with two labels -> DataFrame holding both columns
selected_columns = ['country', 'drives_right']
print(cars[selected_columns])

##dictionary ##pandas #defining_data_frame #csv #dataframe

CSV to DataFrame (2)

# Bring in pandas under its usual alias
import pandas as pd

# Treat the first CSV column as row labels rather than as ordinary data
csv_path = 'cars.csv'
cars = pd.read_csv(csv_path, index_col=0)

# Display the loaded DataFrame
print(cars)

#python #textpreprocessing #nlp #dataframe #pandas #save #export

create and save dataframe to csv

import pandas as pd

# Column-oriented source data: one list per column
products = ['Desktop Computer', 'Tablet', 'Printer', 'Laptop']
prices = [850, 200, 150, 1300]
data = {'Product': products, 'Price': prices}

# Fix the column order explicitly when building the frame
column_order = ['Product', 'Price']
df = pd.DataFrame(data, columns=column_order)

# Placeholder destination - replace with a real path before running
target = r'Path where you want to store the exported CSV file\File Name.csv'
df.to_csv(target)
# df.to_csv('file_name.csv', encoding='utf-8', index=False)
print(df)

# Export only selected columns; `data` must be a DataFrame here.
# List every column you want to keep - the original trailing `...` was a
# placeholder that pandas would have looked up as a literal column label.
data[['column1', 'column2', 'column3']].to_csv('fileNameWhereYouwantToWrite.csv')

# Build a DataFrame inside a loop: gather the rows in a plain list and create
# the frame once at the end. (DataFrame.append no longer exists in pandas 2.x,
# and growing a frame row by row copies it on every iteration.)
rows = []
for i in range(n):  # n = number of rows to produce; range() needs an argument
    #....
    rows.append(text)
df = pd.DataFrame(rows)

#py #dataframe #pandas #combine #column

Combine columns in dataframe

# Preferred: join several string columns row by row with one separator
data['resume'] = data[['Resume_title', 'City', 'State', 'Description', 'work_experiences', 'Educations', 'Skills', 'Certificates', 'Additional Information']].agg(' '.join, axis=1)


# Alternatives (each line is an independent option; pick one)
df["period"] = df["Year"] + df["quarter"]
df['Period'] = df['Year'] + ' ' + df['Quarter']
df["period"] = df["Year"].astype(str) + df["quarter"] # Cast with astype(str) first when a column is not string-typed
# Beware of NaNs: NaN + str stays NaN, and astype(str) turns NaN into the text 'nan'.
df['period'] = df[['Year', 'quarter', ...]].agg('-'.join, axis=1) # Several string columns; `...` is a placeholder for more names - replace it before running
df['period'] = df[['Year', 'quarter']].apply(lambda x: ''.join(x), axis=1)
# Using the cat() method of the .str accessor
df['Period'] = df.Year.str.cat(df.Quarter)
df['Period'] = df.Year.astype(str).str.cat(df.Quarter.astype(str), sep='q')
df['AllTogether'] = df['Country'].str.cat(df[['State', 'City']], sep=' - ') # Pass na_rep to substitute a string for NaN values
# Cast every chosen column to str; summing along the row then concatenates the strings
columns = ['whatever', 'columns', 'you', 'choose']
df['period'] = df[columns].astype(str).sum(axis=1)

#a function
def str_join(df, sep, *cols):
    """Concatenate the given columns of ``df`` row-wise into one string Series.

    Args:
        df: DataFrame that contains every column named in ``cols``.
        sep: Separator inserted between the column values.
        *cols: Column labels, joined in the order given. At least one is
            required; with none, ``reduce`` raises ``TypeError``.

    Returns:
        A Series of strings. Every column is converted with ``astype(str)``
        first, so non-string dtypes are accepted and NaN becomes ``'nan'``.
        A single column is returned as strings too.
    """
    from functools import reduce

    # Convert up front so that a single column is also returned as text.
    as_text = [df[col].astype(str) for col in cols]
    return reduce(lambda left, right: left.str.cat(right, sep=sep), as_text)

In [4]: df['cat'] = str_join(df, '-', 'c0', 'c1', 'c2', 'c3')

#py #dataframe #pandas #replace

replace function in dataframe

# Cast to str first so the .str accessor also works on non-string columns
# (NaN becomes the text 'nan', exactly as the final cast did before).
df_drop = df_drop.astype(str)

# Strip every run of characters that is neither a word character nor whitespace.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default. The raw string avoids invalid-escape warnings.
for c in df_drop.columns:
    df_drop[c] = df_drop[c].str.replace(r'[^\w\s]+', '', regex=True)
df_drop.head()

#py #dataframe #pandas

apply function and def

rmsval = df.loc[:, 'c1':'c4']
def getrms(row):
    """Return the root mean square of the values in ``row``.

    Args:
        row: Sequence of numeric values (e.g. one DataFrame row as a Series).

    Returns:
        ``sqrt(mean(row ** 2))`` as a NumPy float. The mean is taken over the
        actual number of values instead of a hard-coded 4, so the result is
        unchanged for four values and correct for any other length. An empty
        row yields 0.0. NaN values propagate to the result.
    """
    values = np.asarray(row, dtype=float)
    if values.size == 0:
        # Same result the previous sum-based formula gave for no values.
        return np.float64(0.0)
    return np.sqrt(np.mean(np.square(values)))
# Compute the RMS over the c1..c4 slice held in `rmsval` only. Applying getrms
# to the whole frame would also feed it every other column of df (including
# 'rms' itself when the cell is run a second time).
df['rms'] = rmsval.apply(getrms, axis=1)
df.head()

#py #dataframe #pandas

create and save dataframe to csv

import pandas as pd

# One list per column, keyed by the column name
data = dict(
    Product=['Desktop Computer', 'Tablet', 'Printer', 'Laptop'],
    Price=[850, 200, 150, 1300],
)

# Build the frame with an explicit column order
wanted_columns = ['Product', 'Price']
df = pd.DataFrame(data, columns=wanted_columns)

# Placeholder destination - replace with a real path before running
csv_destination = r'Path where you want to store the exported CSV file\File Name.csv'
df.to_csv(csv_destination)

print(df)

#py #dataframe #pandas #text #exception

Exception and search text between 2 strings

import re

text = 'this is a text'

# Lazily capture whatever lies between the first 'is' and the next 'text'.
match = re.search('is(.+?)text', text)
if match is None:
    # The delimiters were not found in the string: use the fallback marker
    found = '0 wtitle' # apply your error handling
else:
    found = match.group(1)
found

=> ' is a '   (the first 'is' matched is the one inside 'this')

#py #dataframe #pandas

simply-create-dataframe

import pandas as pd, re

junk = """Shot - Wounded/Injured, Shot - Dead (murder, accidental, suicide), Suicide - Attempt, Murder/Suicide, Attempted Murder/Suicide (one variable unsuccessful), Institution/Group/Business, Mass Murder (4+ deceased victims excluding the subject/suspect/perpetrator , one location), Mass Shooting (4+ victims injured or killed excluding the subject/suspect"""

# Split on a complete "( ... )" group or on a comma followed by whitespace.
rx = re.compile(r'\([^()]+\)|,(\s+)')

data = []
for nugget in rx.split(junk):
    # Because of the capture group, split() also yields the captured
    # whitespace (or None when the parenthesis branch matched) - skip those.
    if not nugget:
        continue
    cleaned = nugget.strip()
    if cleaned:
        data.append(cleaned)

df = pd.DataFrame({'incident_characteristics': data})
print(df)

#pyspark #unique #count #dataframe #column

pyspark dataframe column unique count

# Display the number of distinct values in the "stg_nexus_member_cd" column of trx_1.
# NOTE(review): `f` is presumably `pyspark.sql.functions` imported as `f`, and
# `trx_1` a Spark DataFrame - neither is defined in this snippet; confirm.
trx_1.select(f.countDistinct("stg_nexus_member_cd")).show()

Unique Values in Pandas

Concate dataframe in for loop

Cross Join in Pandas

brondata naar pandas dataframe

Adding new column to a dataframe

loc and iloc - part 3

loc and iloc - part 2

loc and iloc in Pandas

Square Bracket in Pandas

CSV to DataFrame (2)

create and save dataframe to csv

Combine columns in dataframe

replace function in dataframe

apply function and def

create and save dataframe to csv

Exception and search text between 2 strings

simply-create-dataframe

pyspark dataframe column unique count

Save snippets that work with our extensions