# Machine Learning Notes (机器学习笔记)

## Iris scatter-matrix plot

```python
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
# Split into training and test sets (75% / 25% by default)
X_train, X_test, y_train, y_test = train_test_split(
    iris['data'], iris['target'], random_state=0)
# Build a DataFrame so the scatter matrix can label each feature
iris_dataframe = pd.DataFrame(X_train, columns=iris.feature_names)
# Pairwise scatter plots of all four features, colored by class
grr = pd.plotting.scatter_matrix(iris_dataframe, c=y_train, figsize=(15, 15),
                                 marker='o', hist_kwds={'bins': 20}, s=60, alpha=.8)
plt.show()
```


Key parameters of `pd.plotting.scatter_matrix`:

1. `frame` — the pandas DataFrame to plot
2. `alpha` — transparency, usually in (0, 1]
3. `figsize` — figure size in inches, given as a tuple `(width, height)`
4. `ax` — optional Matplotlib axes object, usually `None`
5. `diagonal` — must be exactly one of `{'hist', 'kde'}`: `'hist'` draws a histogram on the diagonal, `'kde'` a kernel density estimate; this is the key parameter of `scatter_matrix`
6. `marker` — any Matplotlib marker style, e.g. `'.'`, `','`, `'o'`
7. `density_kwds` — optional dict of keyword arguments passed to the KDE plot
8. `hist_kwds` — dict of keyword arguments passed to the histogram
9. `range_padding` — float, optional; padding around the x- and y-axis limits near the origin; the larger the value, the farther the data sits from the axes
10. `kwds` — other keyword arguments passed through to `scatter_matrix` itself
11. `c` — point colors
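
For instance, a minimal sketch (reusing `iris_dataframe` and `y_train` from the block above) that switches the diagonal to a kernel density estimate and adds some axis padding:

```python
# Reuses iris_dataframe and y_train from the block above
grr = pd.plotting.scatter_matrix(iris_dataframe, c=y_train, figsize=(15, 15),
                                 diagonal='kde',      # KDE instead of histograms on the diagonal
                                 range_padding=0.2,   # extra whitespace around the axis limits
                                 marker='o', s=60, alpha=.8)
plt.show()
```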

## Scatter plots with plt

`plt.scatter()` draws a scatter plot. Its signature (note Python's `None`, not `none`; the old `verts` and `hold` arguments were removed in Matplotlib 3.x):

```python
plt.scatter(x, y, s=20, c=None, marker='o', cmap=None, norm=None,
            vmin=None, vmax=None, alpha=None, linewidths=None,
            edgecolors=None, **kwargs)
```
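
A minimal usage sketch with synthetic data (all values are illustrative):

```python
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
x = rng.rand(50)
y = rng.rand(50)
colors = rng.rand(50)          # one value per point, mapped through cmap
sizes = 100 * rng.rand(50)     # marker area in points^2

plt.scatter(x, y, s=sizes, c=colors, marker='o', cmap='viridis', alpha=0.6)
plt.colorbar(label='color value')
plt.show()
```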

## KNN algorithm (classification)

```python
# 1. Import dependencies
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# 2. Load the data
iris_data = load_iris()

# 3. Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    iris_data['data'], iris_data['target'], random_state=0)

# 4. Set the number of neighbors used for classification
knn = KNeighborsClassifier(n_neighbors=1)

# 5. Fit the model
knn.fit(X_train, y_train)
```
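
Once fitted, the model can score the test set and classify new measurements; a short sketch (the sample values below are made up):

```python
import numpy as np

# Accuracy on the held-out test set
print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))

# Classify a new, made-up flower: sepal length/width, petal length/width in cm
X_new = np.array([[5.0, 2.9, 1.0, 0.2]])
prediction = knn.predict(X_new)
print("Predicted class: {}".format(iris_data['target_names'][prediction]))
```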

## KNN algorithm (regression)

```python
import mglearn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

X, y = mglearn.datasets.make_wave(n_samples=40)
# Split the wave dataset into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Instantiate the model and set the number of neighbors to consider to 3
reg = KNeighborsRegressor(n_neighbors=3)
# Fit the model using the training data and training targets
reg.fit(X_train, y_train)
print("Test set predictions:\n{}".format(reg.predict(X_test)))
print("Test set R^2: {:.2f}".format(reg.score(X_test, y_test)))
```

## Linear regression

```python
import mglearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X, y = mglearn.datasets.make_wave(n_samples=60)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
lr = LinearRegression().fit(X_train, y_train)
# Coefficients (slope per feature)
print("lr.coef_: {}".format(lr.coef_))
# Intercept
print("lr.intercept_: {}".format(lr.intercept_))

# Training set score
print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
# Test set score
print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))
```

## Ridge regression

```python
from sklearn.linear_model import Ridge

# Default regularization strength (alpha=1.0); reuses the wave split above
ridge = Ridge().fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))

# Tuning: a larger alpha regularizes more strongly
ridge10 = Ridge(alpha=10).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge10.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge10.score(X_test, y_test)))
```

## Lasso regression

```python
import numpy as np
from sklearn.linear_model import Lasso

# Default alpha=1.0 regularizes heavily and may zero out most coefficients
lasso = Lasso().fit(X_train, y_train)

# A smaller alpha keeps more features; raise max_iter so coordinate descent converges
lasso001 = Lasso(alpha=0.01, max_iter=100000).fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso001.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso001.score(X_test, y_test)))
print("Number of features used: {}".format(np.sum(lasso001.coef_ != 0)))
```

## LogisticRegression

```python
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)
# Larger C means weaker regularization; max_iter raised so lbfgs converges
logreg = LogisticRegression(C=100, max_iter=10000).fit(X_train, y_train)
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))
```

## RandomForestClassifier

```python
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=42)
# An ensemble of 5 randomized decision trees
forest = RandomForestClassifier(n_estimators=5, random_state=2)
forest.fit(X_train, y_train)
```
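
Each member of the forest is a full `DecisionTreeClassifier` and can be inspected individually; a short sketch:

```python
print("Test set accuracy: {:.3f}".format(forest.score(X_test, y_test)))
# The individual trees live in forest.estimators_
for i, tree in enumerate(forest.estimators_):
    print("tree {}: train accuracy {:.3f}".format(i, tree.score(X_train, y_train)))
```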

## GradientBoostingClassifier

```python
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))
```

## Linear SVM (LinearSVC)

```python
import matplotlib.pyplot as plt
import mglearn
from sklearn.datasets import make_blobs
from sklearn.svm import LinearSVC

X, y = make_blobs(centers=4, random_state=8)
y = y % 2  # fold the four blobs into two classes
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
linear_svm = LinearSVC().fit(X, y)
print("Coefficient shape: ", linear_svm.coef_.shape)
print("Intercept shape: ", linear_svm.intercept_.shape)
```
## DecisionTreeClassifier

```python
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))
```