Python Quick Reference Guide
Purpose
I’m putting together a list of helpful code snippets and links for using Python for data analysis and data science. I’d like to think I’m not horrible at Python, but I seem to keep googling the same documentation over and over.
Table of Contents
- Import Header
- Reading Data
- Viewing Data
- Formatting DataFrame
- Dataframe SQL Like Functions
- Extracting
- Helpful Functions
- Machine Learning
- Other Resources
Import Header
# General
## Dates and Times
import pytz
from dateutil import parser
from datetime import timezone,datetime,timedelta
from dateutil.relativedelta import relativedelta
## String manipulation
import string
import re #regex
import json #json parsing
## Data Generation
import random
# Data Manipulation
import pandas as pd
import numpy as np
## Vector Math
from numpy.linalg import norm
## Math
import math
# Data Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly as pl
import plotly.express as px
# Data Science
import scipy as sp
import scipy.stats
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
## Linear Regression
from sklearn.linear_model import LinearRegression
## ARMA
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import pacf
from statsmodels.tsa.stattools import acf
## Nearest Neighbor
from sklearn.neighbors import NearestNeighbors
#import annoy #Aprox Nearest Neighbors
## NLP
#import spacy
#nlp = spacy.load('en_core_web_md')
#doc = nlp(open("txtfile.txt").read()) #Importing NLP txt file
# Scraper
import requests
from requests import get
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_19560/3502506230.py in <module>
24 import matplotlib as mpl
25 import matplotlib.pyplot as plt
---> 26 import plotly as pl
27 import plotly.express as px
28
ModuleNotFoundError: No module named 'plotly'
(Fix: install the missing package first, e.g. `pip install plotly`.)
Reading Data
CSV
read_table - Pandas Documentation Link
read_csv - Pandas Documentation Link
read_*(filepath, sep, delimiter, header, names, index_col)
df1 = pd.read_table('test.csv', sep=',')
df2 = pd.read_csv('test.csv')
print(df1)
col1 col2 col3 col4 col5
0 1 2 3 z 10.00
1 a b c Z 12.50
2 A B C z 1282.38
Copying Data
df1.copy()
| col1 | col2 | col3 | col4 | col5 | |
|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | z | 10.00 |
| 1 | a | b | c | Z | 12.50 |
| 2 | A | B | C | z | 1282.38 |
Viewing Data
View Table
# Print Formatting
print(df1)
col1 col2 col3 col4 col5
0 1 2 3 z 10.00
1 a b c Z 12.50
2 A B C z 1282.38
# Top Rows
df1.head()
| col1 | col2 | col3 | col4 | col5 | |
|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | z | 10.00 |
| 1 | a | b | c | Z | 12.50 |
| 2 | A | B | C | z | 1282.38 |
View Table Attributes
# Column Names, Data Types
df1.columns
Index(['col1', 'col2', 'col3', 'col4', 'col5'], dtype='object')
# Specific Data Types
df1.dtypes
col1 object
col2 object
col3 object
col4 object
col5 float64
dtype: object
# Row Count
df1.shape[0]
len(df1.index)
3
View Specific Column
df1['col1']
0 1
1 a
2 A
Name: col1, dtype: object
View Specific Row
# Row by Index
df1.loc[[1]]
| col1 | col2 | col3 | col4 | col5 | |
|---|---|---|---|---|---|
| 1 | a | b | c | Z | 12.5 |
# Row by Value
df1.loc[df1['col1'] == '1']
| col1 | col2 | col3 | col4 | col5 | |
|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | z | 10.0 |
View Specific Cell
# Named Column by Row Index
df1['col1'].values[0]
'1'
View Range of Rows/Columns
# Range of Rows
df1[0:2]
| col1 | col2 | col3 | col4 | col5 | |
|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | z | 10.0 |
| 1 | a | b | c | Z | 12.5 |
# Range of Rows and Columns
df1[0:1][['col1','col2']]
| col1 | col2 | |
|---|---|---|
| 0 | 1 | 2 |
Formatting DataFrame
Axis Names
set_axis - Pandas Documentation Link
rename - Pandas Documentation Link
rename_axis - Pandas Documentation Link
stack - Pandas Documentation Link
# All column/row names
df3 = df1.copy()
df3.set_axis(['col_a', 'col_b', 'col_c', 'col_d', 'col_e'], axis=1, inplace=True) #axis=0 is x axis
print(df3.columns)
Index(['col_a', 'col_b', 'col_c', 'col_d', 'col_e'], dtype='object')
# Select column/row names
df4 = df1.rename(columns = {'col1':'col_a', 'col2':'col_b'})
print(df4.columns)
Index(['col_a', 'col_b', 'col3', 'col4', 'col5'], dtype='object')
Set and Reset Index
set_index - Pandas Documentation Link
reset_index - Pandas Documentation Link
df3 = df3.set_index('col_a')
df3
| col_b | col_c | col_d | col_e | |
|---|---|---|---|---|
| col_a | ||||
| 1 | 2 | 3 | z | 10.00 |
| a | b | c | Z | 12.50 |
| A | B | C | z | 1282.38 |
df3 = df3.reset_index()
df3
| col_a | col_b | col_c | col_d | col_e | |
|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | z | 10.00 |
| 1 | a | b | c | Z | 12.50 |
| 2 | A | B | C | z | 1282.38 |
pd.DataFrame(df3.set_index(['col_a','col_d']).stack())
| 0 | |||
|---|---|---|---|
| col_a | col_d | ||
| 1 | z | col_b | 2 |
| col_c | 3 | ||
| col_e | 10.0 | ||
| a | Z | col_b | b |
| col_c | c | ||
| col_e | 12.5 | ||
| A | z | col_b | B |
| col_c | C | ||
| col_e | 1282.38 |
Change Data Type
astype - Pandas Documentation Link
# On Dataframe
df1.astype({
'col4': 'string',
'col5': 'int',
'col1': 'int'
}, errors='ignore')
| col1 | col2 | col3 | col4 | col5 | |
|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | z | 10 |
| 1 | a | b | c | Z | 12 |
| 2 | A | B | C | z | 1282 |
# On Column
df1['col4'].astype('string')
0 z
1 Z
2 z
Name: col4, dtype: string
Drop Columns
df4 = df4.drop(columns = ['col_a'])
df4
| col_b | col3 | col4 | col5 | |
|---|---|---|---|---|
| 0 | 2 | 3 | z | 10.00 |
| 1 | b | c | Z | 12.50 |
| 2 | B | C | z | 1282.38 |
df4 = df1.rename(columns = {'col1':'col_a'})
df5 = df4[['col3', 'col2']].copy()
df5
| col3 | col2 | |
|---|---|---|
| 0 | 3 | 2 |
| 1 | c | b |
| 2 | C | B |
Iterate Over Column in New Column
# Example: Multiplication
df5 = df1.copy()
df5['new1'] = df5['col4'] * 10
df5['new2'] = df5['col5'] * 10
df5
| col1 | col2 | col3 | col4 | col5 | new1 | new2 | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | z | 10.00 | zzzzzzzzzz | 100.0 |
| 1 | a | b | c | Z | 12.50 | ZZZZZZZZZZ | 125.0 |
| 2 | A | B | C | z | 1282.38 | zzzzzzzzzz | 12823.8 |
# Example: Concatenate
df5['new3'] = df5['col1'] + '-' + df5['col2']
df5
| col1 | col2 | col3 | col4 | col5 | new1 | new2 | new3 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | z | 10.00 | zzzzzzzzzz | 100.0 | 1-2 |
| 1 | a | b | c | Z | 12.50 | ZZZZZZZZZZ | 125.0 | a-b |
| 2 | A | B | C | z | 1282.38 | zzzzzzzzzz | 12823.8 | A-B |
# Example: Capitalizing String
str1 = df5['new1'].values[0]
print(str1.upper())
ZZZZZZZZZZ
# Example: Trying to Capitalize Column (does not work)
df5['cap'] = df5['new1'].upper()
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Input In [26], in <cell line: 2>()
1 # Example: Trying to Capitalize Column (does not work)
----> 2 df5['cap'] = df5['new1'].upper()
File ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py:5575, in NDFrame.__getattr__(self, name)
5568 if (
5569 name not in self._internal_names_set
5570 and name not in self._metadata
5571 and name not in self._accessors
5572 and self._info_axis._can_hold_identifiers_and_holds_name(name)
5573 ):
5574 return self[name]
-> 5575 return object.__getattribute__(self, name)
AttributeError: 'Series' object has no attribute 'upper'
# Lambda Function
df5['cap'] = df5['new1'].apply(lambda x: x.upper())
df5
Dataframe SQL Like Functions
Filtering
# Example: Filter by Column
df6 = df5.loc[df5['col5'] >= 12.5]
df6
| col1 | col2 | col3 | col4 | col5 | new1 | new2 | new3 | |
|---|---|---|---|---|---|---|---|---|
| 1 | a | b | c | Z | 12.50 | ZZZZZZZZZZ | 125.0 | a-b |
| 2 | A | B | C | z | 1282.38 | zzzzzzzzzz | 12823.8 | A-B |
# Example: Filter by Column
df6 = df5.loc[df5['new3'].str.contains(pat = r'a|B')]
df6
| col1 | col2 | col3 | col4 | col5 | new1 | new2 | new3 | |
|---|---|---|---|---|---|---|---|---|
| 1 | a | b | c | Z | 12.50 | ZZZZZZZZZZ | 125.0 | a-b |
| 2 | A | B | C | z | 1282.38 | zzzzzzzzzz | 12823.8 | A-B |
Aggregating
groupby - Pandas Documentation Link
grouper - Pandas Documentation Link, helpful for creating time series using “freq” param
Usable Functions - mean, median, sum
# Setup
d = {'col1': range(0,42,2), 'col2': range(10,220,10), 'col3': ('t','t','t','t','t','t','t','t','t','t','f','f','f','f','f','f','f','f','f','f','f')}
df2 = pd.DataFrame(data=d)
df2
| col1 | col2 | col3 | |
|---|---|---|---|
| 0 | 0 | 10 | t |
| 1 | 2 | 20 | t |
| 2 | 4 | 30 | t |
| 3 | 6 | 40 | t |
| 4 | 8 | 50 | t |
| 5 | 10 | 60 | t |
| 6 | 12 | 70 | t |
| 7 | 14 | 80 | t |
| 8 | 16 | 90 | t |
| 9 | 18 | 100 | t |
| 10 | 20 | 110 | f |
| 11 | 22 | 120 | f |
| 12 | 24 | 130 | f |
| 13 | 26 | 140 | f |
| 14 | 28 | 150 | f |
| 15 | 30 | 160 | f |
| 16 | 32 | 170 | f |
| 17 | 34 | 180 | f |
| 18 | 36 | 190 | f |
| 19 | 38 | 200 | f |
| 20 | 40 | 210 | f |
# Aggregate Mean
df3 = df2.groupby(['col3'], axis=0).mean()
df3
| col1 | col2 | |
|---|---|---|
| col3 | ||
| f | 30.0 | 160.0 |
| t | 9.0 | 55.0 |
# Aggregate Sum (without NaN values)
df3 = df2.groupby(['col3'], axis=0).sum().dropna()
df3
| col1 | col2 | |
|---|---|---|
| col3 | ||
| f | 330 | 1760 |
| t | 90 | 550 |
# Aggregate by different functions
df3 = df2.groupby(['col3'], axis=0).agg({'col1':'sum', 'col2':'mean'})
df3
| col1 | col2 | |
|---|---|---|
| col3 | ||
| f | 330 | 160.0 |
| t | 90 | 55.0 |
Merging
merge - Pandas Documentation Link
Best when only joining for a few columns.
left.merge(right, how='inner', on='key', left_on='left_key', right_on='right_key', suffixes=('_x','_y'), copy=True)
or
pd.merge(left, right[['key', 'col1']], "....")
d = {'col1': ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), 'col2': range(0,7,1), 'col3': ('t','t','t','t','t','t','t')}
left = pd.DataFrame(data=d)
d = {'col1': ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), 'col2': range(11,18,1), 'col3': ('f','f','f','f','f','f','f')}
right = pd.DataFrame(data=d)
# Full Merge
df1 = left.merge(right, how='inner', on='col1', suffixes=('_x','_y'), copy=False)
df1
| col1 | col2_x | col3_x | col2_y | col3_y | |
|---|---|---|---|---|---|
| 0 | Monday | 0 | t | 11 | f |
| 1 | Tuesday | 1 | t | 12 | f |
| 2 | Wednesday | 2 | t | 13 | f |
| 3 | Thursday | 3 | t | 14 | f |
| 4 | Friday | 4 | t | 15 | f |
| 5 | Saturday | 5 | t | 16 | f |
| 6 | Sunday | 6 | t | 17 | f |
# Merge Certain Columns
df2 = pd.merge(left, right[['col1', 'col2']], on='col1', suffixes=('_x','_y'))
df2
| col1 | col2_x | col3 | col2_y | |
|---|---|---|---|---|
| 0 | Monday | 0 | t | 11 |
| 1 | Tuesday | 1 | t | 12 |
| 2 | Wednesday | 2 | t | 13 |
| 3 | Thursday | 3 | t | 14 |
| 4 | Friday | 4 | t | 15 |
| 5 | Saturday | 5 | t | 16 |
| 6 | Sunday | 6 | t | 17 |
Joining
join - Pandas Documentation Link
Best when joining full datasets.
left.join(right, on='key_index_right', how='', lsuffix='', rsuffix='')
d = {'col1': ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), 'col2': range(0,7,1), 'col3': ('t','t','t','t','t','t','t')}
left = pd.DataFrame(data=d)
left = left.set_index('col1')
d = {'col1': ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), 'col2': range(11,18,1), 'col3': ('f','f','f','f','f','f','f')}
right = pd.DataFrame(data=d)
right = right.set_index('col1')
df1 = left.join(right, how='left', on='col1', lsuffix='_left', rsuffix='_right')
df1
| col2_left | col3_left | col2_right | col3_right | |
|---|---|---|---|---|
| col1 | ||||
| Monday | 0 | t | 11 | f |
| Tuesday | 1 | t | 12 | f |
| Wednesday | 2 | t | 13 | f |
| Thursday | 3 | t | 14 | f |
| Friday | 4 | t | 15 | f |
| Saturday | 5 | t | 16 | f |
| Sunday | 6 | t | 17 | f |
Case When
where - Numpy Documentation Link
Best when creating new columns that are not calculated from other columns, or that take on values different from the columns used
d = {'col1': ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), 'col2': range(0,7,1), 'col3': ('t','t','t','t','t','t','t')}
df1 = pd.DataFrame(data=d)
df1['Monday'] = np.where(df1['col1'] == 'Monday', 1, 0)
df1
| col1 | col2 | col3 | Monday | |
|---|---|---|---|---|
| 0 | Monday | 0 | t | 1 |
| 1 | Tuesday | 1 | t | 0 |
| 2 | Wednesday | 2 | t | 0 |
| 3 | Thursday | 3 | t | 0 |
| 4 | Friday | 4 | t | 0 |
| 5 | Saturday | 5 | t | 0 |
| 6 | Sunday | 6 | t | 0 |
Window Functions
Window functions work a bit differently than SQL. It is preferable to window in SQL and further manipulate in python than try to window in python.
groupby - Pandas Documentation Link
shift - Pandas Documentation Link
cumsum - Pandas Documentation Link
rolling - Pandas Documentation Link
d = {'col1': ('2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06', '2023-01-07'), 'col2': range(0,7,1), 'col3': ('a','a','a','b','b','b','b')}
df1 = pd.DataFrame(data=d)
df1['col1'] = pd.to_datetime(df1['col1'])
# Shift by a period (date implied)
df1['col2_shift'] = df1.groupby(['col3'], as_index=False)['col2'].shift(periods=1)
# Cumsum function
df1[['col2_roll']] = df1.groupby(['col3'], as_index=False)['col2'].cumsum()
df1[['col2_roll2']] = df1[['col2']].cumsum()
# Window function
df1 = df1.merge(df1.groupby(['col3'], as_index=False)[['col1','col2']].rolling(2, on='col1', method='single').sum(), on=['col1','col3'], how='left')
df1 = df1.rename(columns={'col2_x':'col2','col2_y':'col2_window'})
df1
| col1 | col2 | col3 | col2_shift | col2_roll | col2_roll2 | col2_window | |
|---|---|---|---|---|---|---|---|
| 0 | 2023-01-01 | 0 | a | NaN | 0 | 0 | NaN |
| 1 | 2023-01-02 | 1 | a | 0.0 | 1 | 1 | 1.0 |
| 2 | 2023-01-03 | 2 | a | 1.0 | 3 | 3 | 3.0 |
| 3 | 2023-01-04 | 3 | b | NaN | 3 | 6 | NaN |
| 4 | 2023-01-05 | 4 | b | 3.0 | 7 | 10 | 7.0 |
| 5 | 2023-01-06 | 5 | b | 4.0 | 12 | 15 | 9.0 |
| 6 | 2023-01-07 | 6 | b | 5.0 | 18 | 21 | 11.0 |
Extracting
Column to List
# Extract First Column (two ways)
ary1 = df5['col1'].to_numpy()
ary2 = np.array(df5['col1'])
ary1, ary2
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Input In [4], in <cell line: 2>()
1 # Extract First Column (two ways)
----> 2 ary1 = df5['col1'].to_numpy()
3 ary2 = np.array(df5['col1'])
4 ary1, ary2
NameError: name 'df5' is not defined
# Extract All Columns
ary1 = np.array(df5.loc[0:, :])
ary1
Row to List
# Extract First Row
ary1 = np.array(df5[0:1])
ary1
# Extract All Rows
ary1 = np.array(df5[0:])
ary1
Helpful Functions
Strings
regex - Python Documentation Link
# String Replacement
str1 = 'Hey Look A Dog'
str2 = str1.replace('Dog', 'Cat')
print(str2)
str3 = 'slash \\'
print(str3)
str3 = str3.replace('\\', '')
print(str3)
Hey Look A Cat
slash \
slash
# String Contains Item
str1 = 'Hey Look A Dog'
str2 = 'Hey Look A Cat'
str3 = 'Dogs Are Cool'
str4 = 'Cats Are Cool'
print('Dog' in str1,',', 'Dog' in str2,',', 'dog' in str3,',', 'dog' in str4)
True , False , False , False
# String Contains df
d = {'col1': [1, 2, 3, 4], 'col2': [str1, str2, str3, str4]}
df2 = pd.DataFrame(data=d)
df2['tf'] = (df2['col2'].str.contains(pat = 'Dog'))
df2
| col1 | col2 | tf | |
|---|---|---|---|
| 0 | 1 | Hey Look A Dog | True |
| 1 | 2 | Hey Look A Cat | False |
| 2 | 3 | Dogs Are Cool | True |
| 3 | 4 | Cats Are Cool | False |
# Regex
print(re.search(r'^D', str1),',', re.search(r'^D', str3),','
, re.findall(r'^D', str1),',', re.findall(r'^D', str3))
None , <re.Match object; span=(0, 1), match='D'> , [] , ['D']
# Pattern Finding
str1 = 'Thanksgiving this year is 11/24/2022'
str2 = re.findall('[0-9]{2}\/[0-9]{2}\/[0-9]{4}', str1)
print(str2[0])
11/24/2022
# String Split
str1 = '01/01/2022'
l = str1.split('/')
print(l)
['01', '01', '2022']
# Finding if Item in String List is in String
str1 = 'Where is the Store'
str2 = 'Hey Look A Cat'
str3 = 'Dogs Are Cool'
str4 = 'Who is That'
str5 = 'Frogs are the Best'
str6 = 'Llamas are Fun'
str7 = 'I like Clouds'
str8 = 'Bats are Awesome'
str9 = 'Dog Dog Dog'
str10 = 'Hello'
animals = ['Dog','Cat','Bat','Frog','Llama']
d = {'col1': range(1,11,1), 'col2': [str1, str2, str3, str4, str5, str6, str7, str8, str9, str10]}
df8 = pd.DataFrame(data=d)
df8['animal_tf'] = df8['col2'].apply(lambda x: any(y.upper() in x.upper() for y in animals))
df8['animal_listed'] = df8['col2'].apply(lambda x: animals[[y.upper() in x.upper() for y in animals].index(True)] if [y.upper() in x.upper() for y in animals].count(True) > 0 else '')
df8
| col1 | col2 | animal_tf | animal_listed | |
|---|---|---|---|---|
| 0 | 1 | Where is the Store | False | |
| 1 | 2 | Hey Look A Cat | True | Cat |
| 2 | 3 | Dogs Are Cool | True | Dog |
| 3 | 4 | Who is That | False | |
| 4 | 5 | Frogs are the Best | True | Frog |
| 5 | 6 | Llamas are Fun | True | Llama |
| 6 | 7 | I like Clouds | False | |
| 7 | 8 | Bats are Awesome | True | Bat |
| 8 | 9 | Dog Dog Dog | True | Dog |
| 9 | 10 | Hello | False |
Vectors
# Vector Manipulation, Adding Vectors to Vectors
v1 = np.array(['#1a2b3c', '#abcdef', '#000001'])
v2 = []
v3 = []
for i in v1:
y = i.lstrip('#')
v2.append(y)
z = np.array([int(y[:2],16), int(y[2:4],16), int(y[4:6],16)])
v3.append(z)
v2,v3
DataFrames
# Fill NaN
df1 = pd.DataFrame(np.nan, index=[0, 1, 2, 3], columns=['A', 'B'])
print(df1)
df2 = df1.fillna(value = '0.0')
print(df2)
A B
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 NaN NaN
A B
0 0.0 0.0
1 0.0 0.0
2 0.0 0.0
3 0.0 0.0
Math
Math - Python Documentation Link
# Numpy Vector Math
v1 = np.array([1,2,3])
v2 = np.array([2,3,4])
print(v1 * v2)
print(v2 / v1)
# Math functions
print(math.sqrt(v1[0]))
print(math.factorial(2))
print(math.fmod(19, 5))
Dates and Times
# String to Datetime
datetime = pd.to_datetime('01/01/2022')
current_date = datetime.now() # current date
# Datetime Oject
'datetime64[ns]'
# Column to Date
pd.to_datetime(table['datetime_col'].apply(lambda x: x.date()))
pd.to_datetime(table['date'], format = '%b/%d/%Y')
# Time Delta (difference between two days)
numofdays(date2, date1)
c = date2 - date1
min_diff = c.seconds / 60
# Extract Date Parts
datetime.year
datetime.month
datetime.day
datetime.date() # no time
# Format datetime
datetime.strftime('%Y-%m-%d') # YYYY-mm-dd format
# Add time to a date
next_year = current_date + relativedelta(years=1)
# ISO Week Month Function
def week_month(date):
    """Return the Monday (start date) of the ISO week containing ``date``.

    Parameters
    ----------
    date : str or datetime-like
        Anything ``pd.to_datetime`` accepts.

    Returns
    -------
    pd.Timestamp
        Midnight on the Monday of the ISO week that contains ``date``.
    """
    date = pd.to_datetime(date)
    # Subtract the weekday offset (Monday == 0) instead of round-tripping
    # through strftime/strptime with '%V'/'%G', which some pandas versions
    # reject; normalize() drops any time-of-day component.
    return (date - pd.Timedelta(days=date.weekday())).normalize()
Plots and Figures
Machine Learning
Exploration
# Corr Matrix: pairwise correlation of numeric columns, ranked against target.
corr_matrix = ml_df.corr()
corr_matrix['y_col'].sort_values(ascending=False)
# Histograms for every numeric column.
# Bug fix: the keyword is 'figsize' -- 'figzise' raises a TypeError.
ml_df.hist(bins=50, figsize=(50, 50))
ML Pipeline
# Separate the target column before transforming the features.
ml_df_y = ml_df['y_col'].copy()
# Fill N/A values.
ml_df['column1'].fillna('None', inplace=True)
# Bug fix: median() must be CALLED -- "ml_df['column2'].median" without
# parentheses is the bound method object, so fillna would not fill with
# the median value.
column2_med = ml_df['column2'].median()
ml_df['column2'].fillna(column2_med, inplace=True)
# Additional derived feature columns.
ml_df['extra_col'] = ml_df['column3'] / ml_df['column4']
# Drop unwanted columns (pass a list, not a set, by convention).
ml_df = ml_df.drop(columns=['bad_col'])
# Preprocessing pipeline: scale numeric columns, one-hot encode categoricals.
cat_attribs = ['column1']
num_attribs = ['column2', 'column3', 'column4', 'extra_col']
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('ohe', OneHotEncoder()),
])
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
])
ml_df_matrix = full_pipeline.fit_transform(ml_df)
Train Test Split
# Split features/target into train and test sets
# (fixed random_state keeps the split reproducible).
train_set_x, test_set_x, train_set_y, test_set_y = train_test_split(
    ml_df_matrix, ml_df_y, test_size=0.2, random_state=3)
# Feature names produced by the ColumnTransformer.
# Bug fix: the pipeline was assigned to a misspelled name ('pipleline'),
# so the following lines raised NameError.
pipeline = make_pipeline(full_pipeline, LinearRegression())
pipeline.fit(ml_df, ml_df['y_col'])
feature_names = pipeline[:-1].get_feature_names_out()
Average Model
Compare a model against a baseline that always predicts the overall training-set average. Useful as a reference point when comparing different models. Calculates the Root Mean Squared Error.
test_set_y = np.array(test_set_y, dtype=float)
train_set_y = np.array(train_set_y, dtype=float)
train_set_mean = train_set_y.mean()
avg_array = [train_set_mean] * len(test_set_y)
avg_mse = mean_squared_error(test_set_y, avg_array)
avg_rmse = np.sqrt(avg_mse)
Example: Random Forest Regressor
An ML example, using the Random Forest Regressor model. Model is compared against the Root Mean Squared Error.
# Train a Random Forest Regressor on the training features/target.
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(train_set_x, train_set_y)
# Evaluate on the held-out set.
test_set_y = np.array(test_set_y, dtype=float)
# Bug fix: predict from the test FEATURES (test_set_x), not the test
# target (test_set_y) -- passing y would be both wrong and shape-invalid.
predict = forest_reg.predict(test_set_x)
forest_mse = mean_squared_error(test_set_y, predict)
Models
SciKit Learn
Supervised:
- OLS
- Bayesian Regression
- Logistic Regression
- General Linear Regression
- Classification - Decision Trees
- Unsupervised Nearest Neighbors
- Random Forest
Unsupervised:
Statsmodels