Remove some unused files

7 years ago · b708c58ef6
--- a/0_numpy_matplotlib_scipy_sympy/matplotlib_ani2.ipynb
+++ b/0_numpy_matplotlib_scipy_sympy/matplotlib_ani2.ipynb
--- a/1_logistic_regression/Least_squares.ipynb
+++ b/1_logistic_regression/Least_squares.ipynb
--- a/1_logistic_regression/Least_squares.py
+++ b/1_logistic_regression/Least_squares.py
@@ -113,6 +113,78 @@ plt.legend()
 plt.show()
 # -

 # ## How to use iterative method to estimate parameters?
 #

 # +
 n_epoch = 3000          # number of passes over the data
 a, b = 1, 1             # initial slope and intercept
 epsilon = 0.001         # learning rate

 # Stochastic gradient descent on L = sum_j (Y[j] - a*X[j] - b)^2:
 # the parameters are updated after every single sample.
 for i in range(n_epoch):
    for j in range(N):
        # Evaluate the residual once so that both partial derivatives are
        # taken at the same (a, b).
        residual = Y[j] - a*X[j] - b
        a = a + epsilon*2*residual*X[j]
        b = b + epsilon*2*residual

    # total squared error over the whole data set after this epoch
    L = sum((Y[j] - a*X[j] - b)**2 for j in range(N))
    print("epoch %4d: loss = %f, a = %f, b = %f" % (i, L, a, b))

 # end points of the fitted line, spanning the observed x-range
 x_min = np.min(X)
 x_max = np.max(X)
 y_min = a * x_min + b
 y_max = a * x_max + b

 plt.scatter(X, Y, label='original data')
 plt.plot([x_min, x_max], [y_min, y_max], 'r', label='model')
 plt.legend()
 plt.show()
 # -

 # ## How to show the iterative process

 # +
 # %matplotlib nbagg

 import matplotlib.pyplot as plt
 import matplotlib.animation as animation

 n_epoch = 3000          # number of passes over the data
 a, b = 1, 1             # initial slope and intercept
 epsilon = 0.001         # learning rate

 fig = plt.figure()
 imgs = []               # one list of artists per animation frame

 # The data points and the x-range never change, so draw/compute them once
 # instead of stacking a new scatter plot onto the axes for every frame.
 plt.scatter(X, Y, label='original data')
 x_min = np.min(X)
 x_max = np.max(X)

 for i in range(n_epoch):
    for j in range(N):
        # Evaluate the residual once so that both partial derivatives are
        # taken at the same (a, b).
        residual = Y[j] - a*X[j] - b
        a = a + epsilon*2*residual*X[j]
        b = b + epsilon*2*residual

    # keep a snapshot of the current model line every 50 epochs
    if i % 50 == 0:
        y_min = a * x_min + b
        y_max = a * x_max + b

        # plt.plot returns a list of line artists, which is exactly the
        # per-frame artist list that ArtistAnimation expects
        img = plt.plot([x_min, x_max], [y_min, y_max], 'r', label='model')
        imgs.append(img)

 ani = animation.ArtistAnimation(fig, imgs)
 plt.show()
 # -

 # ## How to use batch update method?
 #
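 # If some data points are outliers, then the per-sample update above is pulled strongly toward each of them; averaging the gradient over a batch of samples reduces their influence.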

 # ## How to fit polynomial function?
 #
 # If we observe a missile at certain times, how can we estimate its trajectory? According to physical theory, the trajectory can be formulated as:
@@ -217,8 +289,9 @@ Y_est = regr.predict(X_test)
 print("Y_est  = ", Y_est)
 print("Y_test = ", Y_test)
 err = (Y_est - Y_test)**2
 err2 = sklearn.metrics.mean_squared_error(Y_test, Y_est)
 score = regr.score(X_test, Y_test)
 print("err = %f, score = %f" % (np.sqrt(np.sum(err))/N_test, score))
 print("err = %f (%f), score = %f" % (np.sqrt(np.sum(err))/N_test, np.sqrt(err2), score))


 # plot data
--- a/1_logistic_regression/Logistic_regression.ipynb
+++ b/1_logistic_regression/Logistic_regression.ipynb
@@ -5,12 +5,28 @@
   "metadata": {},
   "source": [
    "# Logistic Regression\n",
    "\n",
    "逻辑回归(Logistic Regression, LR)模型其实仅在线性回归的基础上，套用了一个逻辑函数，但也就由于这个逻辑函数，使得逻辑回归模型成为了机器学习领域一颗耀眼的明星，更是计算广告学的核心。本节主要详述逻辑回归模型的基础。\n",
    "\n",
    "\n",
    "## 1 逻辑回归模型\n",
    "回归是一种比较容易理解的模型，就相当于$y=f(x)$，表明自变量$x$与因变量$y$的关系。最常见问题有如医生治病时的望、闻、问、切，之后判定病人是否生病或生了什么病，其中的望闻问切就是获取自变量$x$，即特征数据，判断是否生病就相当于获取因变量$y$，即预测分类。\n",
    "\n",
    "最简单的回归是线性回归，在此借用Andrew NG的讲义，有如图所示，$X$为数据点——肿瘤的大小，$Y$为观测值——是否是恶性肿瘤。通过构建线性回归模型，如$h_\\theta(x)$所示，构建线性回归模型后，即可以根据肿瘤大小，预测是否为恶性肿瘤$h_\\theta(x) \\ge 0.5$为恶性，$h_\\theta(x) \\lt 0.5$为良性。\n",
    "\n",
    "![LinearRegression](images/fig1.gif)\n",
    "\n",
    "然而线性回归的鲁棒性很差，例如在上图的数据集上建立回归，因最右边噪点的存在，使回归模型在训练集上表现都很差。这主要是由于线性回归在整个实数域内敏感度一致，而分类范围，需要在$[0,1]$。\n",
    "\n",
    "逻辑回归就是一种减小预测范围，将预测值限定为$[0,1]$间的一种回归模型，其回归方程与回归曲线如图2所示。逻辑曲线在$z=0$时，十分敏感，在$z>>0$或$z<<0$处，都不敏感，将预测值限定为$(0,1)$。\n",
    "\n",
    "![LogisticFunction](images/fig2.gif)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -171,6 +187,16 @@
    "logistic.train(200)\n",
    "plot_decision_boundary(lambda x: logistic.predict(x), data, label)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## References\n",
    "\n",
    "* [逻辑回归模型(Logistic Regression, LR)基础](https://www.cnblogs.com/sparkwen/p/3441197.html)\n",
    "* [逻辑回归（Logistic Regression）](http://www.cnblogs.com/BYRans/p/4713624.html)"
   ]
  }
 ],
 "metadata": {
--- a/1_logistic_regression/Logistic_regression.py
+++ b/1_logistic_regression/Logistic_regression.py
@@ -0,0 +1,132 @@
 # -*- coding: utf-8 -*-
 # ---
 # jupyter:
 #   jupytext_format_version: '1.2'
 #   kernelspec:
 #     display_name: Python 3
 #     language: python
 #     name: python3
 #   language_info:
 #     codemirror_mode:
 #       name: ipython
 #       version: 3
 #     file_extension: .py
 #     mimetype: text/x-python
 #     name: python
 #     nbconvert_exporter: python
 #     pygments_lexer: ipython3
 #     version: 3.5.2
 # ---

 # # Logistic Regression
 #
 # 逻辑回归(Logistic Regression, LR)模型其实仅在线性回归的基础上，套用了一个逻辑函数，但也就由于这个逻辑函数，使得逻辑回归模型成为了机器学习领域一颗耀眼的明星，更是计算广告学的核心。本节主要详述逻辑回归模型的基础。
 #
 #
 # ## 1 逻辑回归模型
 # 回归是一种比较容易理解的模型，就相当于$y=f(x)$，表明自变量$x$与因变量$y$的关系。最常见问题有如医生治病时的望、闻、问、切，之后判定病人是否生病或生了什么病，其中的望闻问切就是获取自变量$x$，即特征数据，判断是否生病就相当于获取因变量$y$，即预测分类。
 #
 # 最简单的回归是线性回归，在此借用Andrew NG的讲义，有如图所示，$X$为数据点——肿瘤的大小，$Y$为观测值——是否是恶性肿瘤。通过构建线性回归模型，如$h_\theta(x)$所示，构建线性回归模型后，即可以根据肿瘤大小，预测是否为恶性肿瘤$h_\theta(x) \ge 0.5$为恶性，$h_\theta(x) \lt 0.5$为良性。
 #
 # ![LinearRegression](images/fig1.gif)
 #
 # 然而线性回归的鲁棒性很差，例如在上图的数据集上建立回归，因最右边噪点的存在，使回归模型在训练集上表现都很差。这主要是由于线性回归在整个实数域内敏感度一致，而分类范围，需要在$[0,1]$。
 #
 # 逻辑回归就是一种减小预测范围，将预测值限定为$[0,1]$间的一种回归模型，其回归方程与回归曲线如图2所示。逻辑曲线在$z=0$时，十分敏感，在$z>>0$或$z<<0$处，都不敏感，将预测值限定为$(0,1)$。
 #
 # ![LogisticFunction](images/fig2.gif)
 #
 #

 # +
 # %matplotlib inline

 from __future__ import division
 import numpy as np
 import sklearn.datasets
 import matplotlib.pyplot as plt

 np.random.seed(0)


 # +
 # build the two-moons toy data set
 data, label = sklearn.datasets.make_moons(n_samples=200, noise=0.30)

 # peek at the first ten samples and their classes
 print("data  = ", data[:10])
 print("label = ", label[:10])

 # scatter plot of both features, coloured by class
 plt.scatter(x=data[:, 0], y=data[:, 1], c=label)
 plt.title(label="Original Data")

 # +
 def plot_decision_boundary(predict_func, data, label, step=0.01, margin=0.5):
    """Plot the regions predicted by a classifier together with the samples.

    The classifier is evaluated on a regular grid covering the data range
    (padded by ``margin``); the result is drawn as filled contours with the
    samples scattered on top.

    Args:
        predict_func (callable): receives an array of 2-D points (one per
            row) and returns an array with one predicted value per point
        data (numpy.ndarray): samples; columns 0 and 1 are the plotted x/y
        label (numpy.ndarray): per-sample values used to colour the points
        step (float): spacing of the evaluation grid
        margin (float): padding added on every side of the data range
    """
    x_min, x_max = data[:, 0].min() - margin, data[:, 0].max() + margin
    y_min, y_max = data[:, 1].min() - margin, data[:, 1].max() + margin

    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))

    # evaluate every grid node in one call, then restore the grid shape
    Z = predict_func(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(data[:, 0], data[:, 1], c=label, cmap=plt.cm.Spectral)
    plt.show()



 # +
 def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of ``x``, element-wise."""
    return 1.0 / (1 + np.exp(-x))


 class Logistic(object):
    """Binary logistic-regression classifier trained by stochastic gradient ascent.

    The decision score of a sample ``x`` is ``dot(weights, x) + b``; the
    predicted probability of class 1 is the sigmoid of that score.
    """

    def __init__(self, data, label):
        """Keep the training set and start with all parameters equal to one.

        Args:
            data (numpy.ndarray): training samples, shape (n_samples, n_features)
            label (numpy.ndarray): target per sample (expected to be 0 or 1)
        """
        self.data = data
        self.label = label

        self.data_num, n = np.shape(data)
        self.weights = np.ones(n)
        self.b = 1

    def train(self, num_iteration=150):
        """Fit the parameters by stochastic gradient ascent.

        Every pass visits each training sample exactly once, in a freshly
        shuffled order, and takes one gradient step per sample.

        Args:
            num_iteration (int): number of passes over the training set
        """
        alpha = 0.01  # learning rate
        for _ in range(num_iteration):
            # Index the data through the permutation itself, so no sample is
            # skipped or repeated within a pass.
            for idx in np.random.permutation(self.data_num):
                sample = self.data[idx]
                # The bias enters the score once, not once per feature.
                score = np.dot(sample, self.weights) + self.b
                error = self.label[idx] - sigmoid(score)
                self.weights += alpha * error * sample
                self.b += alpha * error

    def predict(self, predict_data):
        """Classify samples with the current parameters.

        Args:
            predict_data (numpy.ndarray): samples, shape (n_samples, n_features)

        Returns:
            numpy.ndarray: 1 where the score is positive, else 0, per sample
        """
        scores = np.dot(np.asarray(predict_data), self.weights) + self.b
        return (scores > 0).astype(int)

 # -

 # fit the model on the sample data, then draw its decision regions
 logistic = Logistic(data, label)
 logistic.train(num_iteration=200)
 plot_decision_boundary(logistic.predict, data, label)

 # ## References
 #
 # * [逻辑回归模型(Logistic Regression, LR)基础](https://www.cnblogs.com/sparkwen/p/3441197.html)
 # * [逻辑回归（Logistic Regression）](http://www.cnblogs.com/BYRans/p/4713624.html)
--- a/1_logistic_regression/images/fig1.gif
+++ b/1_logistic_regression/images/fig1.gif
--- a/1_logistic_regression/images/fig2.gif
+++ b/1_logistic_regression/images/fig2.gif
--- a/1_logistic_regression/images/fig3.gif
+++ b/1_logistic_regression/images/fig3.gif
--- a/1_logistic_regression/linear
+++ b/1_logistic_regression/linear
--- a/1_logistic_regression/linear_regression.py
+++ b/1_logistic_regression/linear_regression.py
@@ -1,66 +0,0 @@

 import matplotlib.pyplot as plt
 import numpy as np
 import sklearn
 from sklearn import datasets

 # load data: a single feature column (index 2) and the regression target
 d = datasets.load_diabetes()

 X = d.data[:, 2]
 Y = d.target

 # draw original data
 plt.scatter(X, Y)
 plt.show()


 ###############################################################################
 # Least squares
 ###############################################################################

 # L = \sum_{i=1, N} (y_i - a*x_i - b)^2
 # Setting dL/da = dL/db = 0 yields the 2x2 linear system A1 * [a, b] = B1.
 N = X.shape[0]

 S_X2 = np.sum(X*X)
 S_X  = np.sum(X)
 S_XY = np.sum(X*Y)
 S_Y  = np.sum(Y)

 A1 = np.array([[S_X2, S_X], [S_X, N]])
 B1 = np.array([S_XY, S_Y])

 # solve the system directly instead of forming the explicit inverse
 coeff = np.linalg.solve(A1, B1)

 # end points of the fitted line, spanning the observed x-range
 x_min = np.min(X)
 x_max = np.max(X)
 y_min = coeff[0] * x_min + coeff[1]
 y_max = coeff[0] * x_max + coeff[1]

 plt.scatter(X, Y)
 plt.plot([x_min, x_max], [y_min, y_max], 'r')
 plt.show()


 ###############################################################################
 # Linear regression
 ###############################################################################
 # the loss function
 #   L = \sum_{i=1, N} (y_i - a*x_i - b)^2

 n_train = 1000          # number of passes over the data


 a, b = 1, 1             # initial slope and intercept
 epsilon = 0.001         # learning rate

 for i in range(n_train):
    for j in range(N):
        # Evaluate the residual once so that both partial derivatives are
        # taken at the same (a, b).
        residual = Y[j] - a*X[j] - b
        a = a + epsilon*2*residual*X[j]
        b = b + epsilon*2*residual

    # total squared error over the whole data set after this epoch
    L = np.sum((Y - a*X - b)**2)
    print("epoch %4d: loss = %f" % (i, L))

--- a/1_logistic_regression/logistic3.py
+++ b/1_logistic_regression/logistic3.py
@@ -1,70 +0,0 @@
 # -*- coding=utf8 -*-
 from __future__ import division
 import numpy as np
 import sklearn.datasets
 import matplotlib.pyplot as plt

 np.random.seed(0)
 data, label = sklearn.datasets.make_moons(200, noise=0.30)

 def plot_decision_boundary(predict_func, data, label):
    """Draw the predicted regions of a classifier with the samples on top.

    Args:
        predict_func (callable): prediction function, called with an array
            of 2-D grid points (one per row)
        data (numpy.ndarray): training samples
        label (numpy.ndarray): training labels, used to colour the samples
    """
    step = 0.01
    pad = .5
    first, second = data[:, 0], data[:, 1]
    x_lo, x_hi = first.min() - pad, first.max() + pad
    y_lo, y_hi = second.min() - pad, second.max() + pad

    grid_x, grid_y = np.meshgrid(np.arange(x_lo, x_hi, step),
                                 np.arange(y_lo, y_hi, step))

    # classify every grid node, then fold the flat result back onto the grid
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    regions = predict_func(grid_points).reshape(grid_x.shape)

    plt.contourf(grid_x, grid_y, regions, cmap=plt.cm.Spectral)
    plt.scatter(first, second, c=label, cmap=plt.cm.Spectral)
    plt.show()

 def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of ``x``, element-wise."""
    return 1.0 / (1 + np.exp(-x))


 class Logistic(object):
    """Binary logistic-regression classifier trained by stochastic gradient ascent.

    The decision score of a sample ``x`` is ``dot(weights, x) + b``; the
    predicted probability of class 1 is the sigmoid of that score.
    """

    def __init__(self, data, label):
        """Keep the training set and start with all parameters equal to one.

        Args:
            data (numpy.ndarray): training samples, shape (n_samples, n_features)
            label (numpy.ndarray): target per sample (expected to be 0 or 1)
        """
        self.data = data
        self.label = label

        self.data_num, n = np.shape(data)
        self.weights = np.ones(n)
        self.b = 1

    def train(self, num_iteration=150):
        """Fit the parameters by stochastic gradient ascent.

        Every pass visits each training sample exactly once, in a freshly
        shuffled order, and takes one gradient step per sample.

        Args:
            num_iteration (int): number of passes over the training set
        """
        alpha = 0.01  # learning rate
        for _ in range(num_iteration):
            # Index the data through the permutation itself, so no sample is
            # skipped or repeated within a pass.
            for idx in np.random.permutation(self.data_num):
                sample = self.data[idx]
                # The bias enters the score once, not once per feature.
                score = np.dot(sample, self.weights) + self.b
                error = self.label[idx] - sigmoid(score)
                self.weights += alpha * error * sample
                self.b += alpha * error

    def predict(self, predict_data):
        """Classify samples with the current parameters.

        Args:
            predict_data (numpy.ndarray): samples, shape (n_samples, n_features)

        Returns:
            numpy.ndarray: 1 where the score is positive, else 0, per sample
        """
        scores = np.dot(np.asarray(predict_data), self.weights) + self.b
        return (scores > 0).astype(int)

 if __name__ == '__main__':
    # Train on the module-level sample data and show the decision regions.
    # The plot call stays inside the guard because `logistic` only exists
    # when the file is run as a script.
    logistic = Logistic(data, label)
    logistic.train(200)
    plot_decision_boundary(lambda x: logistic.predict(x), data, label)
--- a/1_logistic_regression/logistic_demo.py
+++ b/1_logistic_regression/logistic_demo.py
@@ -1,72 +0,0 @@
 # -*- coding=utf8 -*-
 from __future__ import division
 import numpy as np
 import sklearn.datasets
 import matplotlib.pyplot as plt

 np.random.seed(0)
 data, label = sklearn.datasets.make_moons(200, noise=0.30)

 def plot_decision_boundary(predict_func, data, label):
    """Draw the predicted regions of a classifier with the samples on top.

    Args:
        predict_func (callable): prediction function, called with an array
            of 2-D grid points (one per row)
        data (numpy.ndarray): training samples
        label (numpy.ndarray): training labels, used to colour the samples
    """
    step = 0.01
    pad = .5
    first, second = data[:, 0], data[:, 1]
    x_lo, x_hi = first.min() - pad, first.max() + pad
    y_lo, y_hi = second.min() - pad, second.max() + pad

    grid_x, grid_y = np.meshgrid(np.arange(x_lo, x_hi, step),
                                 np.arange(y_lo, y_hi, step))

    # classify every grid node and report the shape of the flat result
    flat = predict_func(np.c_[grid_x.ravel(), grid_y.ravel()])
    print(flat.shape)
    regions = flat.reshape(grid_x.shape)

    plt.contourf(grid_x, grid_y, regions, cmap=plt.cm.Spectral)
    plt.scatter(first, second, c=label, cmap=plt.cm.Spectral)
    plt.show()

 def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of ``x``, element-wise."""
    return 1.0 / (1 + np.exp(-x))


 class Logistic(object):
    """Binary logistic-regression classifier trained by stochastic gradient ascent.

    The decision score of a sample ``x`` is ``dot(weights, x) + b``; the
    predicted probability of class 1 is the sigmoid of that score.
    """

    def __init__(self, data, label):
        """Keep the training set and start with all parameters equal to one.

        Args:
            data (numpy.ndarray): training samples, shape (n_samples, n_features)
            label (numpy.ndarray): target per sample (expected to be 0 or 1)
        """
        self.data = data
        self.label = label

        self.data_num, n = np.shape(data)
        self.weights = np.ones(n)
        self.b = 1

    def train(self, num_iteration=150):
        """Fit the parameters by stochastic gradient ascent.

        Every pass visits each training sample exactly once, in a freshly
        shuffled order, and takes one gradient step per sample.

        Args:
            num_iteration (int): number of passes over the training set
        """
        alpha = 0.01  # learning rate
        for _ in range(num_iteration):
            for idx in np.random.permutation(self.data_num):
                sample = self.data[idx]
                # The bias enters the score once, not once per feature.
                score = np.dot(sample, self.weights) + self.b
                error = self.label[idx] - sigmoid(score)
                self.weights += alpha * error * sample
                self.b += alpha * error

    def predict(self, predict_data):
        """Classify samples with the current parameters.

        Args:
            predict_data (numpy.ndarray): samples, shape (n_samples, n_features)

        Returns:
            numpy.ndarray: 1 where the score is positive, else 0, per sample
        """
        # Computed as one array expression so the result is always a real
        # integer array (a lazy ``map`` object cannot be reshaped by callers).
        scores = np.dot(np.asarray(predict_data), self.weights) + self.b
        return (scores > 0).astype(int)

 if __name__ == '__main__':
    # fit the classifier on the sample data, then draw its decision regions
    logistic = Logistic(data, label)
    logistic.train(num_iteration=200)
    plot_decision_boundary(logistic.predict, data, label)