import graphviz
# This output is generated from the model.
# As you can see, there are several decisions to make,
# and the hierarchy can be drawn like a tree.
# Load the exported DOT file so the notebook renders the tree.
graphviz.Source.from_file('cache/iristree.dot')
Machine Learning - Supervised Learning
first we need to import pandas and load the data with pd.read_csv(file)
import pandas as pd
# Load the Iris data set into a DataFrame.
iris = pd.read_csv('datasets/iris/Iris.csv')
this is the overview of the data
iris
Id | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | |
---|---|---|---|---|---|---|
0 | 1 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 2 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 3 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
... | ... | ... | ... | ... | ... | ... |
145 | 146 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica |
146 | 147 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica |
147 | 148 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica |
148 | 149 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica |
149 | 150 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica |
150 rows × 6 columns
If we want to check the details of the data, use data.info()
iris.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 150 non-null int64
1 SepalLengthCm 150 non-null float64
2 SepalWidthCm 150 non-null float64
3 PetalLengthCm 150 non-null float64
4 PetalWidthCm 150 non-null float64
5 Species 150 non-null object
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB
the Id is not needed in this case, therefore we can just drop it
# dropping unneeded data: remove the 'Id' column in place
iris.drop('Id', axis=1, inplace=True)
Id | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | |
---|---|---|---|---|---|---|
0 | 1 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 2 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 3 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
Now we separate the data labels from the features and save them to X and y. We also need to split the data into train and test sets.
# Features: the four measurement columns; label: the species name.
X = iris[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
y = iris['Species']

from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=123)
we prepare the model by calling it from sklearn.tree
from sklearn import tree
clf = tree.DecisionTreeClassifier()
# With defined train test split
clf = clf.fit(X_train, y_train)
conclusion on cross_val_score
cross_val_score evaluates the model with k-fold cross-validation on the data set; the result is considered good if the scores are above 0.85.
# with cross validation
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the classifier on the full feature/label set.
scores = cross_val_score(clf, X, y, cv=5)
scores
array([0.96666667, 0.96666667, 0.9 , 0.96666667, 1. ])
# model evaluation
from sklearn.metrics import accuracy_score
# Predict the species for the held-out test rows.
y_pred = clf.predict(X_test)

print(y_pred)
print(y_test)

# accuracy_score expects (y_true, y_pred); round to 3 decimals for display.
acc_score = round(accuracy_score(y_test, y_pred), 3)

print('accuracy', acc_score)
['Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor'
'Iris-setosa' 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa'
'Iris-setosa' 'Iris-versicolor' 'Iris-virginica' 'Iris-setosa'
'Iris-versicolor' 'Iris-virginica' 'Iris-virginica']
72 Iris-versicolor
112 Iris-virginica
132 Iris-virginica
88 Iris-versicolor
37 Iris-setosa
138 Iris-virginica
87 Iris-versicolor
42 Iris-setosa
8 Iris-setosa
90 Iris-versicolor
141 Iris-virginica
33 Iris-setosa
59 Iris-versicolor
116 Iris-virginica
135 Iris-virginica
Name: Species, dtype: object
accuracy 0.933
print(clf.predict([[6.2, 3.4, 5.4, 2.3]])[0])
from sklearn.tree import export_graphviz
# Write the fitted tree to a DOT file, labelling nodes with feature and class names.
export_graphviz(
    clf,
    out_file='cache/iristree.dot',
    feature_names=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'],
    class_names=['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'],
    rounded=True,
    filled=True)
Regression
Linear Regression
import numpy as np
# make dummy data of rooms
bedrooms = np.array([1, 1, 2, 2, 3, 4, 4, 5, 5, 5])

# make dummy price data in dollars (one price per entry in bedrooms)
house_price = np.array([15000, 18000, 27000, 34000, 50000, 68000, 65000, 81000, 85000, 90000])
# visualize in scatterplot
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(bedrooms, house_price)
from sklearn.linear_model import LinearRegression
# train the model with LinearRegression.fit()
# reshape to a single-column 2-D array before fitting
bedrooms = bedrooms.reshape(-1, 1)
linreg = LinearRegression()
linreg.fit(bedrooms, house_price)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
# plotting the correlation between number of rooms and house prices:
# raw points as a scatter plot, fitted predictions as a line
plt.scatter(bedrooms, house_price)
plt.plot(bedrooms, linreg.predict(bedrooms))
Logistic Regression
import pandas as pd
# Load the social network ads data set and display it.
df = pd.read_csv('datasets/socmedAds/Social_Network_Ads.csv')
df
User ID | Gender | Age | EstimatedSalary | Purchased | |
---|---|---|---|---|---|
0 | 15624510 | Male | 19 | 19000 | 0 |
1 | 15810944 | Male | 35 | 20000 | 0 |
2 | 15668575 | Female | 26 | 43000 | 0 |
3 | 15603246 | Female | 27 | 57000 | 0 |
4 | 15804002 | Male | 19 | 76000 | 0 |
... | ... | ... | ... | ... | ... |
395 | 15691863 | Female | 46 | 41000 | 1 |
396 | 15706071 | Male | 51 | 23000 | 1 |
397 | 15654296 | Female | 50 | 20000 | 1 |
398 | 15755018 | Male | 36 | 33000 | 0 |
399 | 15594041 | Female | 49 | 36000 | 1 |
400 rows × 5 columns
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 User ID 400 non-null int64
1 Gender 400 non-null object
2 Age 400 non-null int64
3 EstimatedSalary 400 non-null int64
4 Purchased 400 non-null int64
dtypes: int64(4), object(1)
memory usage: 15.8+ KB
# Drop the 'User ID' column.
data = df.drop(columns=['User ID'])

# One-hot encode the remaining non-numeric columns.
data = pd.get_dummies(data)
data
Age | EstimatedSalary | Purchased | Gender_Female | Gender_Male | |
---|---|---|---|---|---|
0 | 19 | 19000 | 0 | False | True |
1 | 35 | 20000 | 0 | False | True |
2 | 26 | 43000 | 0 | True | False |
3 | 27 | 57000 | 0 | True | False |
4 | 19 | 76000 | 0 | False | True |
... | ... | ... | ... | ... | ... |
395 | 46 | 41000 | 1 | True | False |
396 | 51 | 23000 | 1 | False | True |
397 | 50 | 20000 | 1 | True | False |
398 | 36 | 33000 | 0 | False | True |
399 | 49 | 36000 | 1 | True | False |
400 rows × 5 columns
# Features and target for the logistic regression model.
X = data[['Age', 'EstimatedSalary', 'Gender_Female', 'Gender_Male']]
y = data['Purchased']
# data normalization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# calculating the mean and standard deviation of every attribute column
# to be used on every transform function
# NOTE(review): the scaler is fitted on all rows, before any train/test split,
# so the scaling statistics include the rows later used for testing.
scaler.fit(X)
scaled_data = scaler.transform(X)
# Wrap the scaled array back into a DataFrame with the original column names.
scaled_data = pd.DataFrame(scaled_data, columns=X.columns)
scaled_data
Age | EstimatedSalary | Gender_Female | Gender_Male | |
---|---|---|---|---|
0 | -1.781797 | -1.490046 | -1.020204 | 1.020204 |
1 | -0.253587 | -1.460681 | -1.020204 | 1.020204 |
2 | -1.113206 | -0.785290 | 0.980196 | -0.980196 |
3 | -1.017692 | -0.374182 | 0.980196 | -0.980196 |
4 | -1.781797 | 0.183751 | -1.020204 | 1.020204 |
... | ... | ... | ... | ... |
395 | 0.797057 | -0.844019 | 0.980196 | -0.980196 |
396 | 1.274623 | -1.372587 | -1.020204 | 1.020204 |
397 | 1.179110 | -1.460681 | 0.980196 | -0.980196 |
398 | -0.158074 | -1.078938 | -1.020204 | 1.020204 |
399 | 1.083596 | -0.990844 | 0.980196 | -0.980196 |
400 rows × 4 columns
# validation with cross validation
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
model = linear_model.LogisticRegression()
# 5-fold cross-validation of the logistic regression on the scaled features.
scores = cross_val_score(model, scaled_data, y, cv=5)
scores
array([0.7 , 0.95 , 0.9375, 0.8125, 0.7 ])
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_data, y, test_size=0.2, random_state=1)
model.fit(X_train, y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
# examine model accuracy
model.score(X_test, y_test)
0.825