ISOM Department
* The $ or >>> prompt
* Use ! to run terminal commands within a cell

>>> import this
* Can be used to train and fine-tune DL models
Replace github.com with github.dev in the URL for any repo.

print("Hello World")
x = 5
y = "Hello"
z = [1,2,3]
y = 10 # replaced value
Use ? or ?? after a function or variable, as in the note below.
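A small illustrative note (this is IPython/Jupyter syntax, not plain Python; int is just an example object):

# In a Jupyter/IPython cell:
# int?    shows the docstring of int
# int??   shows the source when available (falls back to the docstring for built-ins)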
x = "5"
y = int(x)
z = float(x)
Precedence | Operator | Description |
---|---|---|
1 | () | Parentheses |
2 | ** | Exponentiation |
3 | +x , -x , ~x | Unary plus, Unary minus, Bitwise NOT |
4 | * , / , // , % | Multiplication, Division, Floor division, Modulus |
5 | + , - | Addition, Subtraction |
6 | << , >> | Bitwise shift operators |
7 | & | Bitwise AND |
8 | ^ | Bitwise XOR |
9 | \| | Bitwise OR |
10 | == , != , < , <= , > , >= | Comparisons, Equality, Inequality |
11 | not | Logical NOT |
12 | and | Logical AND |
13 | or | Logical OR |
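A few quick examples illustrating the table:

print(2 + 3 * 4)         # 14: * binds tighter than +
print((2 + 3) * 4)       # 20: parentheses are evaluated first
print(2 ** 3 ** 2)       # 512: ** is applied right to left
print(not True or True)  # True: not binds tighter than or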
x = []
y = [1,2,3]
z = [1, 2, 3, ["foo", "bar"]]
x = {}
y = set()
z = {"a":1, "b":2, "c":3}
k = {1,2,3,4,4,5}
x = [1,2,3,4,5]
print(x[0])
print(x[-1])
print(x[5]) # IndexError: list index out of range
x = [1,2,3,4,5]
print(x[0:3])
print(x[:3])
print(x[3:])
print(x[::2])
print(x[::-1])
x = [1,2,3,4,5]
for i in x:
    print(i) # this is a code block

Code blocks begin after a colon (:) and are indented.
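The outlines below use comments as placeholders; here is a small runnable sketch (names are illustrative) showing how indentation defines nested blocks:

def describe(n):
    if n % 2 == 0:
        return "even"
    else:
        return "odd"

for i in range(3):
    print(i, describe(i))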
def function():
    # do something
    if condition:
        # do something
        # do something else
        if another_condition:
            # do something
        else:
            # do something else
    # do something else
if condition:
    # do something
    # do something else
    if another_condition:
        # do something
    else:
        # do something else
else:
    # do something else
# do something else
for i in range(10):
    # do something
    # do something else
    if i % 2 == 0:
        # do something
    else:
        # do something else
if condition:
    # do something
else:
    # do something else
x = [1,2,3,4,5]
for i in x:
    print(i)
x = [1,2,3,4,5]
for i in range(len(x)):
    print(x[i])
x = [1,2,3,4,5]
i = 0
while i < len(x):
    print(x[i])
    i += 1
x = [1,2,3,4,5]
y = []
for i in x:
    if i % 2 == 0:
        y.append(i)
print(y)
x = [1,2,3,4,5]
y = [i for i in x if i % 2 == 0]
print(y)
Lazy objects such as range or generator expressions can be wrapped in list() to evaluate them.

if condition:
    # do something
elif condition: # optional
    # do something else
else: # optional
    # do something else
* Comparison: == , != , < , <= , > , >=
* Logical: and , or , not
* Membership: in , not in
* Identity: is , is not
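A short illustration of these operators:

x = [1, 2, 3]
y = [1, 2, 3]
print(2 in x)               # True  (membership)
print(5 not in x)           # True
print(x == y)               # True  (equal values)
print(x is y)               # False (different objects)
print(1 < 2 and not 3 > 4)  # True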
Value Type | Truthy | Falsy |
---|---|---|
Boolean | True | False |
Null | | None |
Zero | | 0 , 0.0 , 0j |
Empty Collections | | "" , () , [] , {} , set() , range(0) |
Others | Any other value not listed in Falsy | |
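A minimal sketch checking truthiness with bool() (the values below are illustrative):

for value in [True, None, 0, 0.0, "", [], {}, set(), range(0), 42, "text", [0]]:
    print(repr(value), "->", bool(value))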
def my_function():
print("Hello from a function")
my_function()
def my_function(name):
print("Hello " + name)
def my_function(x):
    return x**2
# execute
print(my_function(2))
def my_function(x=2):
    return x**2
# execute
print(my_function()) # Where is the argument?
def my_function(x=2, y=3):
    return x**y
# execute
print(my_function(y=2, x=3))
# or
print(my_function(x=3, y=2))
print(my_function(3, 2)) # positional
# or
print(my_function(x=3)) # keyword
# or
print(my_function(y=3))
# or
print(my_function())
What are the arguments?
def my_function(*args):
    return sum(args)
# execute
print(my_function(1,2,3,4,5))
print(my_function(1,2,3))
def my_function(**kwargs):
    return kwargs
# execute
print(my_function(a=1, b=2, c=3))
print(my_function(x=1, y=2))
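*args and **kwargs can also be combined in one signature; a minimal sketch:

def my_function(*args, **kwargs):
    print(args)    # tuple of positional arguments
    print(kwargs)  # dictionary of keyword arguments

my_function(1, 2, a=3, b=4)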
* args and kwargs are just names
* args is a tuple
* kwargs is a dictionary
* args and kwargs are optional and must be at the end
* args (positional) comes before kwargs (keyword)
* args and kwargs can be used together (see the sketch above)

print(x)                # prints the value of the variable x
print("x")
print(input)
print(input())
z = print
z("hello world")
print("Hello world")
print = input
print("Hello world")
f = lambda x: x**2
print(f(2))
num_list = [1,2,3,4,5]
# in just one line
squared = list(map(lambda x: x**2, num_list))
num_list = [1,2,3,4,5]
# Three lines of code
def f(x):
    return x**2
squared = list(map(f, num_list))
import pandas as pd
df = pd.DataFrame({"col1": [1,2,3]}) # instance
df2 = pd.DataFrame({"A": [4,5,6], "B": [7,8,9]}) # another instance
# calling a method
df.head()
df2.head()
# accessing a property
df.columns
df2.columns
dir() and help()
import math #simple import
# calling a library function
math.sqrt(4)
# Discovering the library
dir(math)
help(math.sqrt)
Always prefix the function with the library name
from math import sqrt # import a specific function
# calling the function
sqrt(4)
from math import * # import everything
# calling the function
sqrt(4)
Risk of overwriting functions
import pandas as pd # aliasing
# calling a function/class
pd.DataFrame()
Conventional for most data science libraries
$ pip install pandas

Remember:
* Use pip3 instead of pip if needed
* Use sudo on Linux and Mac, or run the terminal as administrator on Windows
* pip is a terminal command, not a Python function like print()

$ pip install pandas
Anaconda Python comes with pandas and other libraries pre-installed.
import pandas as pd # conventional alias
Setup and configuration are typically done at the top of a script or notebook.
# Assumed for all examples
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import sklearn
* Introduction to Series and DataFrame
* Creating a Series and DataFrame
* Accessing and modifying data in Series and DataFrame
# from a list
s = pd.Series([1, 2, 3, 4, 5])
print(s)
# from a dictionary
d = {'a': 1, 'b': 2, 'c': 3}
s = pd.Series(d)
print(s)
# from a scalar value
s = pd.Series(5, index=[0, 1, 2, 3, 4])
print(s)
s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
# Accessing data
print(s['a'])
print(s.iloc[0]) # positional access (plain s[0] is deprecated when the index uses labels)
# Modifying data
s['a'] = 6
print(s)
# from a dictionary
d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
df = pd.DataFrame(d)
print(df)
# from a list of dictionaries
data = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4, 'c': 5}]
df = pd.DataFrame(data)
print(df)
# from a list of lists
data = [[1, 2, 3], [4, 5, 6]]
df = pd.DataFrame(data, columns=['a', 'b', 'c'])
print(df)
* Columns: [] and .
* Rows: loc, iloc, and []
* Rows and columns: loc, iloc, and []
* Single cells: at and iat
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
# single column
print(df['a'])
# or
print(df.a)
# multiple columns
print(df[['b', 'a']])
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}, index=['x', 'y', 'z'])
# single row by position
print(df.iloc[0])
# or by index label
print(df.loc['x'])
# multiple rows (slices)
print(df.iloc[0:1])
# or by index
print(df.loc['x':'y']) # label slice, includes both endpoints
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}, index=['x', 'y', 'z'])
# single cell
print(df.at['x', 'a'])
# or
print(df.iat[0, 0])
# multiple cells (slices)
print(df.loc['x':'y', 'a':'b'])
# or
print(df.iloc[0:2, 0:2])
print(df.loc[:, 'a':'b'])
print(df.iloc[:, 0:2])
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}, index=['x', 'y', 'z'])
# single condition
print(df[df['a'] > 1])
# multiple conditions
print(df[(df['a'] > 1) & (df['b'] < 6)])
# or using query
print(df.query('a > 1 and b < 6'))
read_* functions:

df = pd.read_csv('file.csv')
df = pd.read_excel('file.xlsx')
df = pd.read_sql('SELECT * FROM table', connection)
df = pd.read_json('file.json')
df = pd.read_html('http://example.com/tables.html') # note: returns a list of DataFrames
df = pd.read_stata('file.dta')
df = pd.read_sas('file.sas7bdat')
df = pd.read_parquet('file.parquet') # requires pyarrow
The result is conventionally assigned to df for easy reference, but any name works:

df_data = pd.read_csv('file.csv')
url = "https://raw.githubusercontent.com/resbaz/r-novice-gapminder-files/master/data/gapminder-FiveYearData.csv"
df = pd.read_csv(url)
# Displaying the first few rows
print(df.head())
# Displaying the last few rows
print(df.tail())
# Displaying the shape
print(df.shape)
# Displaying the columns
print(df.columns)
# Displaying the data types
print(df.dtypes)
# Displaying the summary statistics
print(df.describe())
# Displaying the unique values
print(df['country'].unique())
# Displaying the value counts
print(df['country'].value_counts())
to_* functions:

df.to_csv('file.csv')
df.to_excel('file.xlsx')
df.to_sql('table', connection)
df.to_json('file.json')
df.to_html('file.html')
df.to_stata('file.dta')
# note: pandas can read SAS files but has no to_sas writer
df.to_parquet('file.parquet') # requires pyarrow
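A commonly used option worth knowing: to_csv also writes the index by default; pass index=False to omit it.

df.to_csv('file.csv', index=False)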
* Specify the encoding (e.g., utf-8) in the read_* functions
* Use the astype method in DataFrame to convert data types (see the sketch below)
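A minimal sketch of both points (the file name and column names are illustrative):

# specify the encoding when reading a file
# df = pd.read_csv('file.csv', encoding='utf-8')

# convert column types with astype
df_types = pd.DataFrame({'a': ['1', '2', '3'], 'b': [1, 2, 3]})
df_types['a'] = df_types['a'].astype(int)       # string -> int
df_types = df_types.astype({'b': 'float64'})    # per-column mapping
print(df_types.dtypes)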
df = pd.DataFrame({
'a': [1, 2, np.nan],
'b': [4, np.nan, 6],
'c': [np.nan, 8, 9]
})
print(df)
# drop rows with missing data
print(df.dropna())
# drop columns with missing data
print(df.dropna(axis=1))
# fill missing data
print(df.fillna(0))
df = pd.DataFrame({
'a': [1, 2, 2, 3],
'b': [4, 5, 5, 6],
'c': [7, 8, 8, 9],
})
print(df)
# drop duplicates
print(df.drop_duplicates())
# drop duplicates based on a column
print(df.drop_duplicates(subset=['a']))
# display duplicates
print(df[df.duplicated()])
# display duplicates based on a column
print(df[df.duplicated(subset=['a'])])
# Try to print df after each operation
# What do you notice?
Use the inplace argument to change the original DataFrame.
df = pd.DataFrame({
'a': [1, 2, 2, 3],
'b': [4, 5, 5, 6],
'c': [7, 8, 8, 9],
})
print(df)
# drop duplicates, use this with any
# operation that changes the dataframe
df = df.drop_duplicates()
print(df)
# drop duplicates using inplace
df.drop_duplicates(inplace=True)
print(df)
df = pd.DataFrame({
'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9],})
print(df)
# rename columns
df_renamed = df.rename(columns={'a': 'A', 'b': 'B'})
print(df_renamed)
# alternatively
df.columns = ['A', 'B', 'C']
print(df)
# replace values
df_replaced = df_renamed.replace(1, 100)
print(df_replaced)
import pandas as pd
url = "https://raw.githubusercontent.com/resbaz/r-novice-gapminder-files/master/data/gapminder-FiveYearData.csv"
df = pd.read_csv(url)
print(df.describe())
# or
print(df['gdpPercap'].mean())
print(df['gdpPercap'].median())
print(df['gdpPercap'].std())
print(df['gdpPercap'].min())
print(df['gdpPercap'].max())
print(df['gdpPercap'].quantile(0.25))
print(df['gdpPercap'].quantile(0.75))
# use the gapminder data, already loaded in df
print(df.groupby('country')['gdpPercap'].mean())
# or
print(df.groupby('country')['gdpPercap'].agg(
['mean', 'median', 'std', 'min', 'max', 'count']))
# or
print(df.groupby('country').agg(
{
'gdpPercap': ['mean', 'median', 'std', 'min', 'max', 'count'],
'lifeExp': ['mean', 'median', 'std', 'min', 'max', 'count'],
}
))
Use the transform method to aggregate without reshaping the DataFrame:

df['gdpPercap_mean'] = df.groupby('country')['gdpPercap'].transform('mean')
print(df)
print(df.corr()) # errors on non-numeric columns in recent pandas
print(df.cov())
# You need to select the numeric columns
corr_matrix = df[['year', 'pop', 'lifeExp', 'gdpPercap']].corr()
cov_matrix = df[['year', 'pop', 'lifeExp', 'gdpPercap']].cov()
# To correlate specific columns
df['lifeExp'].corr(df['gdpPercap'])
df['lifeExp'].cov(df['gdpPercap'])
# just choose the column and call the plot method
df['gdpPercap'].plot()
# You can choose the kind of plot
df['gdpPercap'].plot(kind='hist')
# You can also use the plot method of the DataFrame
df.plot(x='year', y='gdpPercap', kind='scatter')
df['gdpPercap'].cumsum().plot()
import matplotlib.pyplot as plt
# Create a figure
fig = plt.figure()
# Create a subplot
ax = fig.add_subplot(111)
# this creates a 1x1 grid of subplots and returns the first one
# alternatively, you can use fig.add_subplot(1, 1, 1)
# Plot data on the subplot
ax.plot([1, 2, 3, 4], [10, 20, 25, 30])
# x-axis is the first list and y-axis is the second list
# replace lists with pandas series or numpy arrays
# Customize the plot (optional)
ax.set_xlabel('x-axis')
ax.set_ylabel('y-axis')
ax.set_title('Title')
# Save the plot (optional)
plt.savefig('plot.png')
# Show the plot
plt.show()
fig, axs = plt.subplots(2, 2)
axs[0, 0].plot(df['year'], df['gdpPercap'])
axs[0, 0].set_title('GDP Per Capita')
axs[0, 1].plot(df['year'], df['lifeExp'])
axs[0, 1].set_title('Life Expectancy')
axs[1, 0].plot(df['year'], df['pop'])
axs[1, 0].set_title('Population')
axs[1, 1].plot(df['year'], df['gdpPercap'] * df['pop'])
axs[1, 1].set_title('GDP')
plt.show()
The plots were a mess. Can you fix them?
df_kenya = df[df['country'] == 'Kenya']
# or use query
df_kenya = df.query('country == "Kenya"')
# replace df with df_kenya in the previous example
import seaborn as sns
# Code is similar to matplotlib
# but you can also use the sns function
sns.lineplot(x='year', y='gdpPercap', data=df)
# scatter plot grouped by continent
sns.scatterplot(x='year', y='gdpPercap', data=df, hue='continent')
# box plot grouped by continent
sns.boxplot(x='continent', y='gdpPercap', data=df)
# violin plot grouped by continent
sns.violinplot(x='continent', y='gdpPercap', data=df)
# pair plot
sns.pairplot(df[['gdpPercap', 'lifeExp', 'pop']])
# heatmap
sns.heatmap(df[['year', 'pop', 'lifeExp', 'gdpPercap']].corr())
# count plot
sns.countplot(x='continent', data=df)
sns.set_theme(style='whitegrid') # alternatives include 'darkgrid', 'white',
# 'dark', 'ticks'
sns.set_palette('pastel') # alternatives include 'deep', 'muted',
# 'bright', 'dark', 'colorblind'
sns.set_context('talk') # this sets the font size for talk
# you can also use 'paper', 'notebook', 'poster'
Name | treatmenta | treatmentb |
---|---|---|
John Smith | — | 2 |
Jane Doe | 16 | 11 |
Mary Johnson | 3 | 1 |
Is this tidy?
source: (Wickham 2014)
Name | John Smith | Jane Doe | Mary Johnson |
---|---|---|---|
treatmenta | — | 16 | 3 |
treatmentb | 2 | 11 | 1 |
Is this tidy?
source: (Wickham 2014)
Name | Treatment | Result |
---|---|---|
John Smith | a | — |
Jane Doe | a | 16 |
Mary Johnson | a | 3 |
John Smith | b | 2 |
Jane Doe | b | 11 |
Mary Johnson | b | 1 |
Is this tidy?
source: (Wickham 2014)
df = pd.DataFrame({
'Name': ['John Smith', 'Jane Doe', 'Mary Johnson'],
'treatmenta': [None, 16, 3],
'treatmentb': [2, 11, 1]})
# Melt the DataFrame to long format
df_long = df.melt(
id_vars=['Name'],
var_name='Treatment',
value_name='Result')
print(df_long)
Name | Treatment | Result |
---|---|---|
John Smith | treatmenta | — |
Jane Doe | treatmenta | 16 |
Mary Johnson | treatmenta | 3 |
John Smith | treatmentb | 2 |
Jane Doe | treatmentb | 11 |
Mary Johnson | treatmentb | 1 |
Let's fix Treatment to contain just a and b.

# just remove the word "treatment" from the data
df_long['Treatment'] = df_long['Treatment'].str.replace('treatment', '')
df.melt(id_vars=['Name'], var_name='Treatment', value_name='Result')

* id_vars specifies the columns to keep as is (identifier variables)
* var_name specifies the name of the new column that contains the wide column names
* value_name specifies the name of the new column that contains the values from the wide columns
df_long.pivot(index='Name', columns='Treatment', values='Result')

* pivot is a reshaping method
* pivot raises a ValueError if the index/column pairs contain duplicate entries
* pivot_table is a reshaping method that can aggregate duplicate entries
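A small sketch of why pivot can fail where pivot_table works (the duplicated data below is hypothetical):

dup = pd.DataFrame({
    'Name': ['John Smith', 'John Smith'],
    'Treatment': ['a', 'a'],
    'Result': [1, 2]})
# dup.pivot(index='Name', columns='Treatment', values='Result')  # ValueError: duplicate entries
print(dup.pivot_table(index='Name', columns='Treatment', values='Result'))  # aggregates with the mean -> 1.5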
df = pd.DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'],
'B': ['one', 'one', 'two', 'two', 'one', 'one'],
'C': ['small', 'large', 'large', 'small', 'small', 'large'],
'D': [1, 2, 2, 3, 3, 4]})
print(df)
# pivot table
print(df.pivot_table(index='A', columns='B', values='D'))
# More arguments
print(df.pivot_table(index='A',
columns=['B', 'C'], values='D', aggfunc='sum',
fill_value=0, margins=True, margins_name='Total'))
import statsmodels.api as sm
data = sm.datasets.longley.load_pandas()
print(data.data) # whole dataset
print(data.endog) # endogenous variable
print(data.exog) # exogenous variables
print(data.endog_name)
print(data.exog_name)
res = sm.OLS(data.endog, data.exog).fit()
print(res.summary())
import statsmodels.api as sm
import matplotlib.pyplot as plt
fig = sm.graphics.qqplot(res.resid, line='q')
plt.show()
import statsmodels.api as sm
import statsmodels.formula.api as smf
data = sm.datasets.get_rdataset('epil', package='MASS').data
print(data)
mod = smf.glm(
"y ~ age + trt + base",
data,
family=sm.families.Poisson()
)
res = mod.fit()
print(res.summary())
# check for overdispersion
if res.pearson_chi2 / res.df_resid > 1:
    print('We have overdispersion')
else:
    print('No evidence to suggest overdispersion')
Quarto: select Create New Project.

Course Material: https://malmarz.netlify.app/en/courses/pyintro/

Don't Forget