逻辑回归算法
1、准备数据
0 1:59 2:2 3:43.4 4:2 5:1
0 1:36 2:1 3:57.2 4:1 5:1
0 1:61 2:2 3:190 4:2 5:1
1 1:58 2:3 3:128 4:4 5:3
1 1:55 2:3 3:80 4:3 5:4
0 1:61 2:1 3:94.4 4:2
0 1:38 2:1 3:76 4:1 5:1
0 1:42 2:1 3:240 4:3 5:2
0 1:50 2:1 3:74 4:1 5:1
0 1:58 2:3 3:68.6 4:2 5:2
0 1:68 2:3 3:132.8 4:4 5:2
1 1:25 2:2 3:94.6 4:4 5:3
0 1:52 2:1 3:56 4:1 5:1
0 1:31 2:1 3:47.8 4:2 5:1
1 1:36 2:3 3:31.6 4:3 5:1
0 1:42 2:1 3:66.2 4:2 5:1
1 1:14 2:3 3:138.6 4:3 5:3
0 1:32 2:1 3:114 4:2 5:3
0 1:35 2:1 3:40.2 4:2 5:1
1 1:70 2:3 3:177.2 4:4 5:3
1 1:65 2:2 3:51.6 4:4 5:4
0 1:45 2:2 3:124 4:2 5:4
1 1:68 2:3 3:127.2 4:3 5:3
0 1:31 2:2 3:124.8 4:2 5:3
2、python算法
# Logistic-regression baseline with scikit-learn on the LIBSVM-format sample file.
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the feature matrix (x) and the label vector (y) from the data file.
x, y = datasets.load_svmlight_file("../../wa.txt")

# Hold out 33% of the rows as the test set; random_state=0 pins the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.33, random_state=0
)

# Standardize the features: scale by standard deviation without centering
# (with_mean=False). The scaler is fitted on the training rows only and then
# reused unchanged on the test rows.
sc = StandardScaler(copy=False, with_mean=False, with_std=True)
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Train the classifier with scikit-learn's default settings.
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)

# Predict on the held-out rows and print the confusion matrix.
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)  # print confusion_matrix
print(classification_report(Y_test, Y_pred)) # print classification report
3、spark算法
package com.sunbin
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.log4j.{ Level, Logger }
/**
 * Classifies gastric-cancer metastasis with logistic regression (Spark MLlib, RDD API).
 *
 * Loads the LIBSVM-format file `wa.txt`, splits it into a training and a test set,
 * trains a binary model with L-BFGS, and prints one `(prediction, label)` pair per
 * test row.
 */
object LogisticRegressionTest {

  /**
   * Program entry point.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    Logger.getRootLogger.setLevel(Level.WARN)
    val conf = new SparkConf().setMaster("local").setAppName("logistic")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    try {
      // A LabeledPoint is a local vector (dense or sparse) paired with a label.
      // MLlib uses it for supervised learning; because the label is stored as a
      // Double, it serves both regression and classification. For binary
      // classification the positive label is 1 and the negative label is 0; for
      // multiclass problems the labels should be indices starting at 0: 0, 1, 2, ...
      val data: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(spark.sparkContext, "wa.txt")

      // The 0.8 / 0.2 weights are just one possible training/test ratio;
      // the split is seeded with 2L.
      val splits = data.randomSplit(Array(0.8, 0.2), 2L)
      val training = splits(0).cache()
      val test = splits(1)

      // Build a LogisticRegressionWithLBFGS trainer, set the number of classes
      // to 2 (binary), and run it on the training set to obtain the model.
      val model = new LogisticRegressionWithLBFGS().setNumClasses(2).run(training)

      // Score every test row, keeping the prediction next to the true label.
      val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
        (model.predict(features), label)
      }

      // Collect to the driver before printing: RDD.foreach(println) runs on the
      // executors, so its output reaches the driver console only in local mode.
      // The test split of this sample is small enough to collect.
      predictionAndLabels.collect().foreach(println)
    } finally {
      // Always release the session, even when loading or training fails.
      spark.stop()
    }
  }
}
相关推荐
Wyt00 2020-06-08
EchoYY 2020-04-29
theta = np.zeros #theta = array,构造全为零的行向量。grad[0,j] = np.sum/len #∑term / m. return value > threshol
Kwong 2020-04-26
playoffs 2020-04-24
算法与数学之美 2020-04-20
wangqing 2020-02-15
chognzhihongseu 2020-01-30
kingzone 2020-01-29
sxyhetao 2019-12-19
lixiaotao 2019-12-17
PeterHuang0 2019-11-29
yukyinbaby 2019-11-08
wbingyang 2019-10-20
darlingtangli 2019-07-12
caozhenjun0 2016-09-25
只布布倩 2016-02-19
yonezcy 2019-06-29
YUAN 2019-06-28
yonezcy 2019-06-27