Purpose

I’m putting together a list of helpful code snippets and links for using python for data analysis and data science. I’d like to think I’m not horrible at python, but I seem to keep googling the same documentation over and over.

Table of Contents

Import Header

# General
## Dates and Times
import pytz
from dateutil import parser
from datetime import timezone,datetime,timedelta
from dateutil.relativedelta import relativedelta

## String manipulation
import string
import re #regex
import json #json parsing
## Data Generation
import random

# Data Manipulation
import pandas as pd
import numpy as np
## Vector Math
from numpy.linalg import norm
## Math
import math

# Data Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly as pl
import plotly.express as px

# Data Science
import scipy as sp
import scipy.stats
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
## Linear Regression
from sklearn.linear_model import LinearRegression
## ARMA
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import pacf
from statsmodels.tsa.stattools import acf
## Nearest Neighbor
from sklearn.neighbors import NearestNeighbors
#import annoy #Approx Nearest Neighbors
## NLP
#import spacy
#nlp = spacy.load('en_core_web_md')
#doc = nlp(open("txtfile.txt").read()) #Importing NLP txt file

# Scraper
import requests
from requests import get
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
---------------------------------------------------------------------------

ModuleNotFoundError                       Traceback (most recent call last)

~\AppData\Local\Temp/ipykernel_19560/3502506230.py in <module>
     24 import matplotlib as mpl
     25 import matplotlib.pyplot as plt
---> 26 import plotly as pl
     27 import plotly.express as px
     28 


ModuleNotFoundError: No module named 'plotly'

Reading Data

CSV

read_table - Pandas Documentation Link

read_csv - Pandas Documentation Link

read_<type>(filepath, sep, delimiter, header, names, index_col)

df1 = pd.read_table('test.csv', sep=',')
df2 = pd.read_csv('test.csv')

print(df1)
  col1 col2 col3 col4     col5
0    1    2    3    z    10.00
1    a    b    c    Z    12.50
2    A    B    C    z  1282.38

Copying Data

df1.copy()
col1 col2 col3 col4 col5
0 1 2 3 z 10.00
1 a b c Z 12.50
2 A B C z 1282.38

Viewing Data

View Table

# Print Formatting
print(df1)
  col1 col2 col3 col4     col5
0    1    2    3    z    10.00
1    a    b    c    Z    12.50
2    A    B    C    z  1282.38
# Top Rows
df1.head()
col1 col2 col3 col4 col5
0 1 2 3 z 10.00
1 a b c Z 12.50
2 A B C z 1282.38

View Table Attributes

# Column Names, Data Types
df1.columns
Index(['col1', 'col2', 'col3', 'col4', 'col5'], dtype='object')
# Specific Data Types
df1.dtypes
col1     object
col2     object
col3     object
col4     object
col5    float64
dtype: object
# Row Count
df1.shape[0]
len(df1.index)
3

View Specific Column

df1['col1']
0    1
1    a
2    A
Name: col1, dtype: object

View Specific Row

# Row by Index
df1.loc[[1]]
col1 col2 col3 col4 col5
1 a b c Z 12.5
# Row by Value
df1.loc[df1['col1'] == '1']
col1 col2 col3 col4 col5
0 1 2 3 z 10.0

View Specific Cell

# Named Column by Row Index
df1['col1'].values[0]
'1'

View Range of Rows/Columns

# Range of Rows
df1[0:2]
col1 col2 col3 col4 col5
0 1 2 3 z 10.0
1 a b c Z 12.5
# Range of Rows and Columns
df1[0:1][['col1','col2']]
col1 col2
0 1 2

Formatting DataFrame

Axis Names

set_axis - Pandas Documentation Link

rename - Pandas Documentation Link

rename_axis - Pandas Documentation Link

stack - Pandas Documentation Link

# All column/row names
df3 = df1.copy()
df3.set_axis(['col_a', 'col_b', 'col_c', 'col_d', 'col_e'], axis=1, inplace=True) #axis=0 is x axis
print(df3.columns)
Index(['col_a', 'col_b', 'col_c', 'col_d', 'col_e'], dtype='object')
# Select column/row names
df4 = df1.rename(columns = {'col1':'col_a', 'col2':'col_b'})
print(df4.columns)
Index(['col_a', 'col_b', 'col3', 'col4', 'col5'], dtype='object')

Set and Reset Index

set_index - Pandas Documentation Link

reset_index - Pandas Documentation Link

df3 = df3.set_index('col_a')
df3
col_b col_c col_d col_e
col_a
1 2 3 z 10.00
a b c Z 12.50
A B C z 1282.38
df3 = df3.reset_index()
df3
col_a col_b col_c col_d col_e
0 1 2 3 z 10.00
1 a b c Z 12.50
2 A B C z 1282.38
pd.DataFrame(df3.set_index(['col_a','col_d']).stack())
0
col_a col_d
1 z col_b 2
col_c 3
col_e 10.0
a Z col_b b
col_c c
col_e 12.5
A z col_b B
col_c C
col_e 1282.38

Change Data Type

astype - Pandas Documentation Link

# On Dataframe
df1.astype({
    'col4': 'string',
    'col5': 'int',
    'col1': 'int'
}, errors='ignore')
col1 col2 col3 col4 col5
0 1 2 3 z 10
1 a b c Z 12
2 A B C z 1282
# On Column

df1['col4'].astype('string')
0    z
1    Z
2    z
Name: col4, dtype: string

Drop Columns

df4 = df4.drop(columns = ['col_a'])
df4
col_b col3 col4 col5
0 2 3 z 10.00
1 b c Z 12.50
2 B C z 1282.38
df4 = df1.rename(columns = {'col1':'col_a'})
df5 = df4[['col3', 'col2']].copy()
df5
col3 col2
0 3 2
1 c b
2 C B

Iterate Over Column in New Column

# Example: Multiplication
df5 = df1.copy()
df5['new1'] = df5['col4'] * 10
df5['new2'] = df5['col5'] * 10
df5
col1 col2 col3 col4 col5 new1 new2
0 1 2 3 z 10.00 zzzzzzzzzz 100.0
1 a b c Z 12.50 ZZZZZZZZZZ 125.0
2 A B C z 1282.38 zzzzzzzzzz 12823.8
# Example: Concatenate
df5['new3'] = df5['col1'] + '-' + df5['col2']
df5
col1 col2 col3 col4 col5 new1 new2 new3
0 1 2 3 z 10.00 zzzzzzzzzz 100.0 1-2
1 a b c Z 12.50 ZZZZZZZZZZ 125.0 a-b
2 A B C z 1282.38 zzzzzzzzzz 12823.8 A-B
# Example: Capitalizing String
str1 = df5['new1'].values[0]
print(str1.upper())
ZZZZZZZZZZ
# Example: Trying to Capitalize Column (does not work)
df5['cap'] = df5['new1'].upper()
---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

Input In [26], in <cell line: 2>()
      1 # Example: Trying to Capitalize Column (does not work)
----> 2 df5['cap'] = df5['new1'].upper()


File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py:5575, in NDFrame.__getattr__(self, name)
   5568 if (
   5569     name not in self._internal_names_set
   5570     and name not in self._metadata
   5571     and name not in self._accessors
   5572     and self._info_axis._can_hold_identifiers_and_holds_name(name)
   5573 ):
   5574     return self[name]
-> 5575 return object.__getattribute__(self, name)


AttributeError: 'Series' object has no attribute 'upper'
# Lambda Function
df5['cap'] = df5['new1'].apply(lambda x: x.upper())
df5

Dataframe SQL Like Functions

Filtering

# Example: Filter by Column
df6 = df5.loc[df5['col5'] >= 12.5]
df6
col1 col2 col3 col4 col5 new1 new2 new3
1 a b c Z 12.50 ZZZZZZZZZZ 125.0 a-b
2 A B C z 1282.38 zzzzzzzzzz 12823.8 A-B
# Example: Filter by Column
df6 = df5.loc[df5['new3'].str.contains(pat = r'a|B')]
df6
col1 col2 col3 col4 col5 new1 new2 new3
1 a b c Z 12.50 ZZZZZZZZZZ 125.0 a-b
2 A B C z 1282.38 zzzzzzzzzz 12823.8 A-B

Aggregating

Pandas Documentation Link

groupby - Pandas Documentation Link

grouper - Pandas Documentation Link, helpful for creating time series using “freq” param

Usable Functions - mean, median, sum

# Setup
d = {'col1': range(0,42,2), 'col2': range(10,220,10), 'col3': ('t','t','t','t','t','t','t','t','t','t','f','f','f','f','f','f','f','f','f','f','f')}
df2 = pd.DataFrame(data=d)
df2
col1 col2 col3
0 0 10 t
1 2 20 t
2 4 30 t
3 6 40 t
4 8 50 t
5 10 60 t
6 12 70 t
7 14 80 t
8 16 90 t
9 18 100 t
10 20 110 f
11 22 120 f
12 24 130 f
13 26 140 f
14 28 150 f
15 30 160 f
16 32 170 f
17 34 180 f
18 36 190 f
19 38 200 f
20 40 210 f
# Aggregate Mean
df3 = df2.groupby(['col3'], axis=0).mean()
df3
col1 col2
col3
f 30.0 160.0
t 9.0 55.0
# Aggregate Sum (without NaN values)
df3 = df2.groupby(['col3'], axis=0).sum().dropna()
df3
col1 col2
col3
f 330 1760
t 90 550
# Aggregate by different functions
df3 = df2.groupby(['col3'], axis=0).agg({'col1':'sum', 'col2':'mean'})
df3
col1 col2
col3
f 330 160.0
t 90 55.0

Merging

merge - Pandas Documentation Link

Best when only joining for a few columns.

left.merge(right, how='inner', on='key', left_on='left_key', right_on='right_key', suffixes=('_x','_y'), copy=True)

or

pd.merge(left, right[['key', 'col1']], ...)

d = {'col1': ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), 'col2': range(0,7,1), 'col3': ('t','t','t','t','t','t','t')}
left = pd.DataFrame(data=d)

d = {'col1': ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), 'col2': range(11,18,1), 'col3': ('f','f','f','f','f','f','f')}
right = pd.DataFrame(data=d)

# Full Merge
df1 = left.merge(right, how='inner', on='col1', suffixes=('_x','_y'), copy=False)
df1
col1 col2_x col3_x col2_y col3_y
0 Monday 0 t 11 f
1 Tuesday 1 t 12 f
2 Wednesday 2 t 13 f
3 Thursday 3 t 14 f
4 Friday 4 t 15 f
5 Saturday 5 t 16 f
6 Sunday 6 t 17 f
# Merge Certain Columns
df2 = pd.merge(left, right[['col1', 'col2']], on='col1', suffixes=('_x','_y'))
df2
col1 col2_x col3 col2_y
0 Monday 0 t 11
1 Tuesday 1 t 12
2 Wednesday 2 t 13
3 Thursday 3 t 14
4 Friday 4 t 15
5 Saturday 5 t 16
6 Sunday 6 t 17

Joining

join - Pandas Documentation Link

Best when joining full datasets.

left.join(right, on='key_index_right', how='', lsuffix='', rsuffix='')

d = {'col1': ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), 'col2': range(0,7,1), 'col3': ('t','t','t','t','t','t','t')}
left = pd.DataFrame(data=d)
left = left.set_index('col1')

d = {'col1': ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), 'col2': range(11,18,1), 'col3': ('f','f','f','f','f','f','f')}
right = pd.DataFrame(data=d)
right = right.set_index('col1')

df1 = left.join(right, how='left', on='col1', lsuffix='_left', rsuffix='_right')
df1
col2_left col3_left col2_right col3_right
col1
Monday 0 t 11 f
Tuesday 1 t 12 f
Wednesday 2 t 13 f
Thursday 3 t 14 f
Friday 4 t 15 f
Saturday 5 t 16 f
Sunday 6 t 17 f

Case When

where - Numpy Documentation Link

Best when creating new columns that are not calculated from other columns, or that take on values different than the columns used

d = {'col1': ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), 'col2': range(0,7,1), 'col3': ('t','t','t','t','t','t','t')}
df1 = pd.DataFrame(data=d)

df1['Monday'] = np.where(df1['col1'] == 'Monday', 1, 0)

df1
col1 col2 col3 Monday
0 Monday 0 t 1
1 Tuesday 1 t 0
2 Wednesday 2 t 0
3 Thursday 3 t 0
4 Friday 4 t 0
5 Saturday 5 t 0
6 Sunday 6 t 0

Window Functions

Window functions work a bit differently than SQL. It is preferable to window in SQL and further manipulate in python than try to window in python.

groupby - Pandas Documentation Link

shift - Pandas Documentation Link

cumsum - Pandas Documentation Link

rolling - Pandas Documentation Link

d = {'col1': ('2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06', '2023-01-07'), 'col2': range(0,7,1), 'col3': ('a','a','a','b','b','b','b')}
df1 = pd.DataFrame(data=d)
df1['col1'] = pd.to_datetime(df1['col1'])

# Shift by a period (date implied)
df1['col2_shift'] = df1.groupby(['col3'], as_index=False)['col2'].shift(periods=1)

# Cumsum function
df1[['col2_roll']] = df1.groupby(['col3'], as_index=False)['col2'].cumsum()
df1[['col2_roll2']] = df1[['col2']].cumsum()

# Window function
df1 = df1.merge(df1.groupby(['col3'], as_index=False)[['col1','col2']].rolling(2, on='col1', method='single').sum(), on=['col1','col3'], how='left')
df1 = df1.rename(columns={'col2_x':'col2','col2_y':'col2_window'})

df1
col1 col2 col3 col2_shift col2_roll col2_roll2 col2_window
0 2023-01-01 0 a NaN 0 0 NaN
1 2023-01-02 1 a 0.0 1 1 1.0
2 2023-01-03 2 a 1.0 3 3 3.0
3 2023-01-04 3 b NaN 3 6 NaN
4 2023-01-05 4 b 3.0 7 10 7.0
5 2023-01-06 5 b 4.0 12 15 9.0
6 2023-01-07 6 b 5.0 18 21 11.0

Extracting

Column to List

# Extract First Column (two ways)
ary1 = df5['col1'].to_numpy()
ary2 = np.array(df5['col1'])
ary1, ary2
---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

Input In [4], in <cell line: 2>()
      1 # Extract First Column (two ways)
----> 2 ary1 = df5['col1'].to_numpy()
      3 ary2 = np.array(df5['col1'])
      4 ary1, ary2


NameError: name 'df5' is not defined
# Extract All Columns
ary1 = np.array(df5.loc[0:, :])
ary1

Row to List

# Extract First Row
ary1 = np.array(df5[0:1])
ary1
# Extract All Rows
ary1 = np.array(df5[0:])
ary1

Helpful Functions

Strings

regex - Python Documentation Link

# String Replacement
str1 = 'Hey Look A Dog'
str2 = str1.replace('Dog', 'Cat')
print(str2)

str3 = 'slash \\'
print(str3)
str3 = str3.replace('\\', '')
print(str3)
Hey Look A Cat
slash \
slash 
# String Contains Item
str1 = 'Hey Look A Dog'
str2 = 'Hey Look A Cat'
str3 = 'Dogs Are Cool'
str4 = 'Cats Are Cool'

print('Dog' in str1,',', 'Dog' in str2,',', 'dog' in str3,',', 'dog' in str4)
True , False , False , False
# String Contains df
d = {'col1': [1, 2, 3, 4], 'col2': [str1, str2, str3, str4]}
df2 = pd.DataFrame(data=d)
df2['tf'] = (df2['col2'].str.contains(pat = 'Dog'))
df2
col1 col2 tf
0 1 Hey Look A Dog True
1 2 Hey Look A Cat False
2 3 Dogs Are Cool True
3 4 Cats Are Cool False
# Regex
print(re.search(r'^D', str1),',', re.search(r'^D', str3),','
      , re.findall(r'^D', str1),',', re.findall(r'^D', str3))
None , <re.Match object; span=(0, 1), match='D'> , [] , ['D']
# Pattern Finding
str1 = 'Thanksgiving this year is 11/24/2022'
str2 = re.findall('[0-9]{2}\/[0-9]{2}\/[0-9]{4}', str1)
print(str2[0])
11/24/2022
# String Split
str1 = '01/01/2022'
l = str1.split('/')
print(l)
['01', '01', '2022']
# Finding if Item in String List is in String
str1 = 'Where is the Store'
str2 = 'Hey Look A Cat'
str3 = 'Dogs Are Cool'
str4 = 'Who is That'
str5 = 'Frogs are the Best'
str6 = 'Llamas are Fun'
str7 = 'I like Clouds'
str8 = 'Bats are Awesome'
str9 = 'Dog Dog Dog'
str10 = 'Hello'

animals = ['Dog','Cat','Bat','Frog','Llama']

d = {'col1': range(1,11,1), 'col2': [str1, str2, str3, str4, str5, str6, str7, str8, str9, str10]}
df8 = pd.DataFrame(data=d)
df8['animal_tf'] = df8['col2'].apply(lambda x: any(y.upper() in x.upper() for y in animals))
df8['animal_listed'] = df8['col2'].apply(lambda x: animals[[y.upper() in x.upper() for y in animals].index(True)] if [y.upper() in x.upper() for y in animals].count(True) > 0 else '')
df8
col1 col2 animal_tf animal_listed
0 1 Where is the Store False
1 2 Hey Look A Cat True Cat
2 3 Dogs Are Cool True Dog
3 4 Who is That False
4 5 Frogs are the Best True Frog
5 6 Llamas are Fun True Llama
6 7 I like Clouds False
7 8 Bats are Awesome True Bat
8 9 Dog Dog Dog True Dog
9 10 Hello False

Vectors

# Vector Manipulation, Adding Vectors to Vectors
v1 = np.array(['#1a2b3c', '#abcdef', '#000001'])
v2 = []
v3 = []

for i in v1:
    y = i.lstrip('#')
    v2.append(y)
    
    z = np.array([int(y[:2],16), int(y[2:4],16), int(y[4:6],16)])
    v3.append(z)

v2,v3

DataFrames

# Fill NaN
df1 = pd.DataFrame(np.nan, index=[0, 1, 2, 3], columns=['A', 'B'])
print(df1)
df2 = df1.fillna(value = '0.0')
print(df2)
    A   B
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 NaN NaN
     A    B
0  0.0  0.0
1  0.0  0.0
2  0.0  0.0
3  0.0  0.0

Math

Math - Python Documentation Link

# Numpy Vector Math
v1 = np.array([1,2,3])
v2 = np.array([2,3,4])

print(v1 * v2)
print(v2 / v1)
# Math functions
print(math.sqrt(v1[0]))

print(math.factorial(2))

print(math.fmod(19, 5))

Dates and Times

Pandas Documentation

Datetime Parts

# String to Datetime
datetime = pd.to_datetime('01/01/2022')
current_date = datetime.now() # current date

# Datetime Object
'datetime64[ns]'

# Column to Date
pd.to_datetime(table['datetime_col'].apply(lambda x: x.date()))
pd.to_datetime(table['date'], format = '%b/%d/%Y')

# Time Delta (difference between two days)
numofdays(date2, date1)

c = date2 - date1
min_diff = c.seconds / 60 

# Extract Date Parts
datetime.year
datetime.month
datetime.day
datetime.date() # no time

# Format datetime
datetime.strftime('%Y-%m-%d') # YYYY-mm-dd format

# Add time to a date
next_year = current_date + relativedelta(years=1)

# ISO Week Month Function
def week_month(date):
    """Return the start date (Monday) of the ISO week containing *date*.

    Parameters
    ----------
    date : str or datetime-like
        Anything ``pd.to_datetime`` accepts.

    Returns
    -------
    pd.Timestamp
        Midnight on the Monday of the ISO week that contains *date*.
    """
    date = pd.to_datetime(date)
    # Subtract the weekday offset (Mon=0 ... Sun=6) and drop any time-of-day
    # component. This replaces the original strftime/strptime round-trip
    # through the ISO directives ('%V'/'%G'), which older pandas versions of
    # to_datetime reject, and gives the same Monday for every input date.
    return (date - pd.Timedelta(days=date.weekday())).normalize()

Plots and Figures

Machine Learning

Scikit Learn Documentation

Online ML Guide

Exploration

# Corr Matrix
corr_matrix = ml_df.corr()
corr_matrix['y_col'].sort_values(ascending = False)

# Histograms
ml_df.hist(bins=50, figsize = (50,50))

ML Pipeline

# Separate Y
ml_df_y = ml_df['y_col'].copy()

# Fill N/A
ml_df['column1'].fillna('None', inplace=True)

column2_med = ml_df['column2'].median()
ml_df['column2'].fillna(column2_med, inplace=True)

# Additional Columns
ml_df['extra_col'] = ml_df['column3'] / ml_df['column4']

# Drop Columns
ml_df = ml_df.drop(columns ={'bad_col'})

# Pipeline
cat_attribs = ['column1']
num_attribs = ['column2', 'column3', 'column4', 'extra_col']

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('ohe', OneHotEncoder())
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
    
])

ml_df_matrix = full_pipeline.fit_transform(ml_df)

Train Test Split

# Split

train_set_x, test_set_x, train_set_y, test_set_y = train_test_split(ml_df_matrix, ml_df_y, test_size=0.2, random_state=3)

# Feature Names
pipeline = make_pipeline(full_pipeline, LinearRegression())
pipeline.fit(ml_df, ml_df['y_col'])
feature_names = pipeline[:-1].get_feature_names_out()

Average Model

Compare a model based on the difference between the predicted overall average. Useful in comparing against different models. Calculates the Root Mean Squared Error.

test_set_y = np.array(test_set_y, dtype=float)
train_set_y = np.array(train_set_y, dtype=float)

train_set_mean = train_set_y.mean()
avg_array = [train_set_mean] * len(test_set_y)

avg_mse = mean_squared_error(test_set_y, avg_array)
avg_rmse = np.sqrt(avg_mse)

Example: Random Forest Regressor

An ML example, using the Random Forest Regressor model. Model is compared against the Root Mean Squared Error.

# Train
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(train_set_x, train_set_y)

# Evaluate
test_set_y = np.array(test_set_y, dtype=float)
predict = forest_reg.predict(test_set_x)
forest_mse = mean_squared_error(test_set_y, predict)

Models

SciKit Learn

Supervised:

Unsupervised:

Statsmodels

Other Resources

DateTime Pieces

Nearest Neighbors with Sentences

Random Forest

Time Series Regression