| @@ -645,7 +645,7 @@ | |||||
| "name": "python", | "name": "python", | ||||
| "nbconvert_exporter": "python", | "nbconvert_exporter": "python", | ||||
| "pygments_lexer": "ipython3", | "pygments_lexer": "ipython3", | ||||
| "version": "3.6.3" | |||||
| "version": "3.5.2" | |||||
| } | } | ||||
| }, | }, | ||||
| "nbformat": 4, | "nbformat": 4, | ||||
| @@ -0,0 +1,220 @@ | |||||
| # -*- coding: utf-8 -*- | |||||
| # --- | |||||
| # jupyter: | |||||
| # jupytext_format_version: '1.2' | |||||
| # kernelspec: | |||||
| # display_name: Python 3 | |||||
| # language: python | |||||
| # name: python3 | |||||
| # language_info: | |||||
| # codemirror_mode: | |||||
| # name: ipython | |||||
| # version: 3 | |||||
| # file_extension: .py | |||||
| # mimetype: text/x-python | |||||
| # name: python | |||||
| # nbconvert_exporter: python | |||||
| # pygments_lexer: ipython3 | |||||
| # version: 3.5.2 | |||||
| # --- | |||||
| # # Automatic Differentiation | |||||
| # In this lesson we look at PyTorch's automatic differentiation (autograd) mechanism. Autograd is one of PyTorch's most important features: it spares us from computing complicated derivatives by hand, which greatly reduces the time needed to build a model, and it is something its predecessor, the Torch framework, did not offer. Below we explore what autograd can do through a few examples and look at some of its more advanced uses. | |||||
| import torch | |||||
| from torch.autograd import Variable | |||||
| # ## Autograd in the simple case | |||||
| # Below we demonstrate autograd in some simple cases. "Simple" means the result of the computation is a scalar, i.e. a single number, and we differentiate that scalar. | |||||
| x = Variable(torch.Tensor([2]), requires_grad=True) | |||||
| y = x + 2 | |||||
| z = y ** 2 + 3 | |||||
| print(z) | |||||
| # Through the series of operations above we obtained the final result z from x, which we can write as the formula | |||||
| # | |||||
| # $$ | |||||
| # z = (x + 2)^2 + 3 | |||||
| # $$ | |||||
| # | |||||
| # Then differentiating z with respect to x gives | |||||
| # | |||||
| # $$ | |||||
| # \frac{\partial z}{\partial x} = 2 (x + 2) = 2 (2 + 2) = 8 | |||||
| # $$ | |||||
| # If you are not familiar with derivatives, you can [review them here](https://baike.baidu.com/item/%E5%AF%BC%E6%95%B0#1) | |||||
| # use automatic differentiation | |||||
| z.backward() | |||||
| print(x.grad) | |||||
| # For a simple example like this we can verify what autograd returns, and we can see how convenient it is. For a more complicated expression, differentiating by hand becomes very tedious, so autograd saves us that painful calculation. Let's look at a more complex example. | |||||
| # + | |||||
| x = Variable(torch.randn(10, 20), requires_grad=True) | |||||
| y = Variable(torch.randn(10, 5), requires_grad=True) | |||||
| w = Variable(torch.randn(20, 5), requires_grad=True) | |||||
| out = torch.mean(y - torch.matmul(x, w)) # torch.matmul performs matrix multiplication | |||||
| out.backward() | |||||
| # - | |||||
| # If you are not familiar with matrix multiplication, you can [review it here](https://baike.baidu.com/item/%E7%9F%A9%E9%98%B5%E4%B9%98%E6%B3%95/5446029?fr=aladdin) | |||||
| # gradient of x | |||||
| print(x.grad) | |||||
| # gradient of y | |||||
| print(y.grad) | |||||
| # gradient of w | |||||
| print(w.grad) | |||||
| # The math here is more involved: after the matrix multiplication the result is subtracted element-wise from y, and then all the elements are averaged. Interested readers can try to compute the gradients by hand. With PyTorch's autograd we obtain the derivatives with respect to x, y and w very easily. Deep learning is full of large matrix operations whose derivatives we cannot realistically work out by hand, so autograd makes updating the network very convenient. | |||||
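| # As a quick sanity check, the sketch below (my own addition; it rebuilds the same kind of random tensors rather than reusing the ones above) compares the gradient autograd returns for x with the hand-derived formula: since out is the mean of y - x·w over 10 × 5 entries, every row of x.grad equals -w.sum(dim=1) / 50. | |||||
| # + | |||||
| import torch | |||||
| from torch.autograd import Variable | |||||
| x = Variable(torch.randn(10, 20), requires_grad=True) | |||||
| y = Variable(torch.randn(10, 5), requires_grad=True) | |||||
| w = Variable(torch.randn(20, 5), requires_grad=True) | |||||
| out = torch.mean(y - torch.matmul(x, w)) | |||||
| out.backward() | |||||
| print(x.grad.data[0])                   # first row of the autograd result | |||||
| print((-w.sum(dim=1) / (10 * 5)).data)  # hand-derived value; every row of x.grad should equal this | |||||
| # - | |||||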
| # | |||||
| # | |||||
| # ## Autograd in more complex cases | |||||
| # Above we showed autograd in the simple case where the quantity being differentiated is a scalar. You may wonder how to differentiate a vector or a matrix automatically. Feel free to try it yourself first; below we introduce autograd for multi-dimensional arrays. | |||||
| m = Variable(torch.FloatTensor([[2, 3]]), requires_grad=True) # build a 1 x 2 matrix | |||||
| n = Variable(torch.zeros(1, 2)) # build a zero matrix of the same size | |||||
| print(m) | |||||
| print(n) | |||||
| # compute the values in n from the values in m | |||||
| n[0, 0] = m[0, 0] ** 2 | |||||
| n[0, 1] = m[0, 1] ** 3 | |||||
| print(n) | |||||
| # Writing the expressions above as a formula, we get | |||||
| # $$ | |||||
| # n = (n_0,\ n_1) = (m_0^2,\ m_1^3) = (2^2,\ 3^3) | |||||
| # $$ | |||||
| # Next we backpropagate through n directly, i.e. we take the derivative of n with respect to m. | |||||
| # | |||||
| # At this point we need to be precise about what this derivative means, i.e. how to define | |||||
| # | |||||
| # $$ | |||||
| # \frac{\partial n}{\partial m} = \frac{\partial (n_0,\ n_1)}{\partial (m_0,\ m_1)} | |||||
| # $$ | |||||
| # | |||||
| # In PyTorch, to call autograd in this case we need to pass an argument to `backward()` with the same shape as n, say $(w_0,\ w_1)$; the result of automatic differentiation is then: | |||||
| # $$ | |||||
| # \frac{\partial n}{\partial m_0} = w_0 \frac{\partial n_0}{\partial m_0} + w_1 \frac{\partial n_1}{\partial m_0} | |||||
| # $$ | |||||
| # $$ | |||||
| # \frac{\partial n}{\partial m_1} = w_0 \frac{\partial n_0}{\partial m_1} + w_1 \frac{\partial n_1}{\partial m_1} | |||||
| # $$ | |||||
| n.backward(torch.ones_like(n)) # take (w0, w1) to be (1, 1) | |||||
| print(m.grad) | |||||
| # Autograd gives gradients of 4 and 27, which we can check by hand | |||||
| # $$ | |||||
| # \frac{\partial n}{\partial m_0} = w_0 \frac{\partial n_0}{\partial m_0} + w_1 \frac{\partial n_1}{\partial m_0} = 2 m_0 + 0 = 2 \times 2 = 4 | |||||
| # $$ | |||||
| # $$ | |||||
| # \frac{\partial n}{\partial m_1} = w_0 \frac{\partial n_0}{\partial m_1} + w_1 \frac{\partial n_1}{\partial m_1} = 0 + 3 m_1^2 = 3 \times 3^2 = 27 | |||||
| # $$ | |||||
| # The hand calculation gives the same result | |||||
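| # The weight vector passed to `backward()` does not have to be all ones. The sketch below (my own addition, with arbitrarily chosen weights) repeats the example with the weights (2, 0.5); by the formula above the gradient should be $(2 \cdot 2 m_0,\ 0.5 \cdot 3 m_1^2) = (8,\ 13.5)$. | |||||
| # + | |||||
| import torch | |||||
| from torch.autograd import Variable | |||||
| m = Variable(torch.FloatTensor([[2, 3]]), requires_grad=True) | |||||
| n = Variable(torch.zeros(1, 2)) | |||||
| n[0, 0] = m[0, 0] ** 2 | |||||
| n[0, 1] = m[0, 1] ** 3 | |||||
| n.backward(torch.FloatTensor([[2, 0.5]]))  # (w0, w1) = (2, 0.5) | |||||
| print(m.grad)  # expected: 8 and 13.5 | |||||
| # - | |||||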
| # | |||||
| # | |||||
| # ## Running backward more than once | |||||
| # Calling backward runs automatic differentiation once. If we call backward a second time, the program raises an error and refuses to run. This is because, by default, PyTorch discards the computation graph after one backward pass, so differentiating twice requires keeping the graph explicitly with `retain_graph=True`, as the small example below shows. | |||||
| x = Variable(torch.FloatTensor([3]), requires_grad=True) | |||||
| y = x * 2 + x ** 2 + 3 | |||||
| print(y) | |||||
| y.backward(retain_graph=True) # set retain_graph to True to keep the computation graph | |||||
| print(x.grad) | |||||
| y.backward() # run backward once more; this time the graph is not kept | |||||
| print(x.grad) | |||||
| # The gradient of x has become 16: backward was run twice, so the first gradient of 8 and the second gradient of 8 were added together, giving 16. | |||||
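| # Because gradients accumulate, if you want each backward pass to produce a fresh gradient you must clear `x.grad` in between. A small sketch (my own addition) of the same example with the gradient zeroed between the two calls: | |||||
| # + | |||||
| import torch | |||||
| from torch.autograd import Variable | |||||
| x = Variable(torch.FloatTensor([3]), requires_grad=True) | |||||
| y = x * 2 + x ** 2 + 3 | |||||
| y.backward(retain_graph=True) | |||||
| print(x.grad)        # 8 | |||||
| x.grad.data.zero_()  # clear the accumulated gradient | |||||
| y.backward() | |||||
| print(x.grad)        # 8 again, instead of 16 | |||||
| # - | |||||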
| # | |||||
| # | |||||
| # **Exercise** | |||||
| # | |||||
| # Define | |||||
| # | |||||
| # $$ | |||||
| # x = | |||||
| # \left[ | |||||
| # \begin{matrix} | |||||
| # x_0 \\ | |||||
| # x_1 | |||||
| # \end{matrix} | |||||
| # \right] = | |||||
| # \left[ | |||||
| # \begin{matrix} | |||||
| # 2 \\ | |||||
| # 3 | |||||
| # \end{matrix} | |||||
| # \right] | |||||
| # $$ | |||||
| # | |||||
| # $$ | |||||
| # k = (k_0,\ k_1) = (x_0^2 + 3 x_1,\ 2 x_0 + x_1^2) | |||||
| # $$ | |||||
| # | |||||
| # We want to find | |||||
| # | |||||
| # $$ | |||||
| # j = \left[ | |||||
| # \begin{matrix} | |||||
| # \frac{\partial k_0}{\partial x_0} & \frac{\partial k_0}{\partial x_1} \\ | |||||
| # \frac{\partial k_1}{\partial x_0} & \frac{\partial k_1}{\partial x_1} | |||||
| # \end{matrix} | |||||
| # \right] | |||||
| # $$ | |||||
| # | |||||
| # Reference answer: | |||||
| # | |||||
| # $$ | |||||
| # \left[ | |||||
| # \begin{matrix} | |||||
| # 4 & 3 \\ | |||||
| # 2 & 6 \\ | |||||
| # \end{matrix} | |||||
| # \right] | |||||
| # $$ | |||||
| # + | |||||
| x = Variable(torch.FloatTensor([2, 3]), requires_grad=True) | |||||
| k = Variable(torch.zeros(2)) | |||||
| k[0] = x[0] ** 2 + 3 * x[1] | |||||
| k[1] = x[1] ** 2 + 2 * x[0] | |||||
| # - | |||||
| print(k) | |||||
| # + | |||||
| j = torch.zeros(2, 2) | |||||
| k.backward(torch.FloatTensor([1, 0]), retain_graph=True) | |||||
| j[0] = x.grad.data | |||||
| x.grad.data.zero_() # zero out the previously computed gradient | |||||
| k.backward(torch.FloatTensor([0, 1])) | |||||
| j[1] = x.grad.data | |||||
| # - | |||||
| print(j) | |||||
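| # As a cross-check, newer PyTorch releases (1.5 and later; this is an assumption about the installed version, since the notebook itself targets an older API) can compute the same Jacobian in one call with `torch.autograd.functional.jacobian`: | |||||
| # + | |||||
| import torch | |||||
| from torch.autograd.functional import jacobian | |||||
| def k_func(x): | |||||
|     return torch.stack([x[0] ** 2 + 3 * x[1], 2 * x[0] + x[1] ** 2]) | |||||
| print(jacobian(k_func, torch.FloatTensor([2, 3])))  # expected [[4, 3], [2, 6]] | |||||
| # - | |||||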
| # In the next lesson we will introduce two styles of programming neural networks: dynamic graphs and static graphs | |||||
| @@ -0,0 +1,355 @@ | |||||
| # -*- coding: utf-8 -*- | |||||
| # --- | |||||
| # jupyter: | |||||
| # jupytext_format_version: '1.2' | |||||
| # kernelspec: | |||||
| # display_name: Python 3 | |||||
| # language: python | |||||
| # name: python3 | |||||
| # language_info: | |||||
| # codemirror_mode: | |||||
| # name: ipython | |||||
| # version: 3 | |||||
| # file_extension: .py | |||||
| # mimetype: text/x-python | |||||
| # name: python | |||||
| # nbconvert_exporter: python | |||||
| # pygments_lexer: ipython3 | |||||
| # version: 3.5.2 | |||||
| # --- | |||||
| # # Linear Models and Gradient Descent | |||||
| # This is the first lesson on neural networks. We will study a very simple model, linear regression, together with an optimization algorithm, gradient descent, used to optimize it. Linear regression is a very simple supervised learning model, and gradient descent is the most widely used optimization algorithm in deep learning, so this is where our deep learning journey begins | |||||
| # | |||||
| # | |||||
| # ## Linear regression with one variable | |||||
| # The one-variable linear model is very simple. Suppose we have inputs $x_i$ and targets $y_i$, where each i corresponds to one data point, and we want to build the model | |||||
| # | |||||
| # $$ | |||||
| # \hat{y}_i = w x_i + b | |||||
| # $$ | |||||
| # | |||||
| # $\hat{y}_i$ is the prediction; we want $\hat{y}_i$ to fit the target $y_i$. In plain words, we look for the function that fits $y_i$ with the smallest error, i.e. we minimize | |||||
| # | |||||
| # $$ | |||||
| # \frac{1}{n} \sum_{i=1}^n(\hat{y}_i - y_i)^2 | |||||
| # $$ | |||||
| # How do we minimize this error? | |||||
| # | |||||
| # This is where **gradient descent** comes in. It is the first optimization algorithm we encounter: very simple, yet very powerful, and used heavily in deep learning, so let's start from a simple example to understand how it works | |||||
| # ## Gradient descent | |||||
| # For gradient descent we first need to be clear about what the gradient is, and then see how to use it to descend. | |||||
| # ### The gradient | |||||
| # Mathematically, the gradient is built from derivatives: for a function of one variable it is simply the derivative, and for a function of several variables it is the vector of partial derivatives. For example, for a function f(x, y), the gradient of f is | |||||
| # | |||||
| # $$ | |||||
| # (\frac{\partial f}{\partial x},\ \frac{\partial f}{\partial y}) | |||||
| # $$ | |||||
| # | |||||
| # written grad f(x, y) or $\nabla f(x, y)$. The gradient at a specific point $(x_0,\ y_0)$ is $\nabla f(x_0,\ y_0)$. | |||||
| # | |||||
| # The picture below shows the gradient of the function $f(x) = x^2$ at x = 1 | |||||
| # | |||||
| #  | |||||
| # What does the gradient mean? Geometrically, the gradient at a point gives the direction in which the function changes fastest: for f(x, y) at a point $(x_0, y_0)$, the function increases fastest along the direction of $\nabla f(x_0,\ y_0)$, so following the gradient brings us toward a maximum faster, and conversely, following the opposite direction of the gradient brings us toward a minimum faster. | |||||
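| # To make this concrete, here is a tiny sketch (my own example, not part of the original lesson) that uses PyTorch's autograd to evaluate the gradient of $f(x, y) = x^2 + y^2$ at the point (1, 2); by hand it is (2x, 2y) = (2, 4). | |||||
| # + | |||||
| import torch | |||||
| from torch.autograd import Variable | |||||
| x = Variable(torch.FloatTensor([1]), requires_grad=True) | |||||
| y = Variable(torch.FloatTensor([2]), requires_grad=True) | |||||
| f = x ** 2 + y ** 2 | |||||
| f.backward() | |||||
| print(x.grad, y.grad)  # the gradient of f at (1, 2), i.e. (2, 4) | |||||
| # - | |||||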
| # ### Gradient descent | |||||
| # With this understanding of the gradient we can see how gradient descent works. Above we want to minimize the error, i.e. find its minimum point, so we move in the direction opposite to the gradient to reach that minimum. | |||||
| # | |||||
| # Here is an intuitive picture. Suppose we are somewhere on a large mountain and do not know the way down, so we decide to take it one step at a time: at each position we compute the gradient there and take one step along the negative gradient, i.e. downhill in the steepest direction, then compute the gradient at the new position and take another step in the steepest downhill direction from there. Walking like this step by step, we keep going until we feel we have reached the foot of the mountain. Of course, walking this way we may not reach the actual foot of the mountain but only some local low point. | |||||
| # | |||||
| # In our problem, this means repeatedly changing w and b along the negative gradient direction until we find the pair of w and b that makes the error smallest. | |||||
| # | |||||
| # When updating, we have to decide how large each step is. In the mountain analogy this is the length of each downhill step; it is called the learning rate, written $\eta$. The learning rate matters a lot: different learning rates give different results, a learning rate that is too small makes the descent very slow, and one that is too large makes the iterates jump around, as the example below shows | |||||
| # | |||||
| #  | |||||
| # | |||||
| # The upper plot uses a reasonable learning rate, while the lower one uses a learning rate that is too large, which makes the iterates keep jumping around | |||||
| # | |||||
| # Finally, the update rule is | |||||
| # | |||||
| # $$ | |||||
| # w := w - \eta \frac{\partial f(w,\ b)}{\partial w} \\ | |||||
| # b := b - \eta \frac{\partial f(w,\ b)}{\partial b} | |||||
| # $$ | |||||
| # | |||||
| # By updating iteratively we eventually find an optimal pair of w and b; this is the principle of gradient descent. | |||||
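| # Before moving on, here is a minimal numerical sketch (my own toy example, not part of the original lesson) of this update rule applied to $f(x) = x^2$, whose minimum is at x = 0; with a modest learning rate the iterate shrinks toward 0, while a learning rate above 1 would make it jump back and forth and diverge. | |||||
| # + | |||||
| x = 5.0    # starting point | |||||
| eta = 0.3  # learning rate | |||||
| for i in range(10): | |||||
|     grad = 2 * x        # f'(x) = 2x for f(x) = x^2 | |||||
|     x = x - eta * grad  # x := x - eta * f'(x) | |||||
| print(x)  # close to the minimum at 0 | |||||
| # - | |||||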
| # | |||||
| # Finally, this figure gives a visual illustration of the method | |||||
| # | |||||
| #  | |||||
| # | |||||
| # | |||||
| # That was the theory; below we study the linear model further through an example | |||||
| # + | |||||
| import torch | |||||
| import numpy as np | |||||
| from torch.autograd import Variable | |||||
| torch.manual_seed(2017) | |||||
| # + | |||||
| # load the data x and y | |||||
| x_train = np.array([[3.3], [4.4], [5.5], [6.71], [6.93], [4.168], | |||||
| [9.779], [6.182], [7.59], [2.167], [7.042], | |||||
| [10.791], [5.313], [7.997], [3.1]], dtype=np.float32) | |||||
| y_train = np.array([[1.7], [2.76], [2.09], [3.19], [1.694], [1.573], | |||||
| [3.366], [2.596], [2.53], [1.221], [2.827], | |||||
| [3.465], [1.65], [2.904], [1.3]], dtype=np.float32) | |||||
| # + | |||||
| # plot the data | |||||
| import matplotlib.pyplot as plt | |||||
| # %matplotlib inline | |||||
| plt.plot(x_train, y_train, 'bo') | |||||
| # + | |||||
| # convert to Tensor | |||||
| x_train = torch.from_numpy(x_train) | |||||
| y_train = torch.from_numpy(y_train) | |||||
| # define the parameters w and b | |||||
| w = Variable(torch.randn(1), requires_grad=True) # random initialization | |||||
| b = Variable(torch.zeros(1), requires_grad=True) # initialize with 0 | |||||
| # + | |||||
| # build the linear regression model | |||||
| x_train = Variable(x_train) | |||||
| y_train = Variable(y_train) | |||||
| def linear_model(x): | |||||
| return x * w + b | |||||
| # - | |||||
| y_ = linear_model(x_train) | |||||
| # With the steps above the model is defined. Before updating the parameters, let's see what the model output looks like | |||||
| plt.plot(x_train.data.numpy(), y_train.data.numpy(), 'bo', label='real') | |||||
| plt.plot(x_train.data.numpy(), y_.data.numpy(), 'ro', label='estimated') | |||||
| plt.legend() | |||||
| # **Think about it: the red points are the model's predictions and they appear to line up. Are these points really on one straight line?** | |||||
| # Now we need to compute the error function, namely | |||||
| # | |||||
| # $$ | |||||
| # \frac{1}{n} \sum_{i=1}^n(\hat{y}_i - y_i)^2 | |||||
| # $$ | |||||
| # + | |||||
| # compute the error | |||||
| def get_loss(y_, y): | |||||
| return torch.mean((y_ - y) ** 2) | |||||
| loss = get_loss(y_, y_train) | |||||
| # - | |||||
| # print the loss to see how large it is | |||||
| print(loss) | |||||
| # With the error function defined, we now need the gradients of w and b. Thanks to PyTorch's autograd we do not have to compute them by hand; if you are interested you can derive them yourself. The gradients of w and b are | |||||
| # | |||||
| # $$ | |||||
| # \frac{\partial}{\partial w} = \frac{2}{n} \sum_{i=1}^n x_i(w x_i + b - y_i) \\ | |||||
| # \frac{\partial}{\partial b} = \frac{2}{n} \sum_{i=1}^n (w x_i + b - y_i) | |||||
| # $$ | |||||
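| # These two formulas can be checked directly against autograd. Below is a self-contained sketch (my own, with made-up data rather than the training set above) that computes the loss, calls backward, and compares the result with the hand-derived expressions: | |||||
| # + | |||||
| import torch | |||||
| from torch.autograd import Variable | |||||
| x = Variable(torch.FloatTensor([1.0, 2.0, 3.0])) | |||||
| y = Variable(torch.FloatTensor([2.0, 4.0, 6.0])) | |||||
| w = Variable(torch.FloatTensor([0.5]), requires_grad=True) | |||||
| b = Variable(torch.FloatTensor([0.1]), requires_grad=True) | |||||
| loss = torch.mean((w * x + b - y) ** 2) | |||||
| loss.backward() | |||||
| grad_w = 2 * torch.mean(x * (w * x + b - y))  # (2/n) sum x_i (w x_i + b - y_i) | |||||
| grad_b = 2 * torch.mean(w * x + b - y)        # (2/n) sum (w x_i + b - y_i) | |||||
| print(w.grad.data, grad_w.data)  # the two values should agree | |||||
| print(b.grad.data, grad_b.data) | |||||
| # - | |||||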
| # automatic differentiation | |||||
| loss.backward() | |||||
| # look at the gradients of w and b | |||||
| print(w.grad) | |||||
| print(b.grad) | |||||
| # update the parameters once | |||||
| w.data = w.data - 1e-2 * w.grad.data | |||||
| b.data = b.data - 1e-2 * b.grad.data | |||||
| # After updating the parameters, let's look at the model output once more | |||||
| y_ = linear_model(x_train) | |||||
| plt.plot(x_train.data.numpy(), y_train.data.numpy(), 'bo', label='real') | |||||
| plt.plot(x_train.data.numpy(), y_.data.numpy(), 'ro', label='estimated') | |||||
| plt.legend() | |||||
| # From the plot above we can see that after the update the red line has moved below the blue points and does not fit the true values particularly well yet, so we need to update a few more times | |||||
| for e in range(10): # run 10 updates | |||||
| y_ = linear_model(x_train) | |||||
| loss = get_loss(y_, y_train) | |||||
| w.grad.zero_() # remember to zero the gradients | |||||
| b.grad.zero_() # remember to zero the gradients | |||||
| loss.backward() | |||||
| w.data = w.data - 1e-2 * w.grad.data # update w | |||||
| b.data = b.data - 1e-2 * b.grad.data # update b | |||||
| print('epoch: {}, loss: {}'.format(e, loss.data[0])) | |||||
| y_ = linear_model(x_train) | |||||
| plt.plot(x_train.data.numpy(), y_train.data.numpy(), 'bo', label='real') | |||||
| plt.plot(x_train.data.numpy(), y_.data.numpy(), 'ro', label='estimated') | |||||
| plt.legend() | |||||
| # After 10 updates, the red predictions already fit the blue true values fairly well. | |||||
| # | |||||
| # You have now built your first machine learning model. Keep it up and finish the small exercise below. | |||||
| # **Exercise:** | |||||
| # | |||||
| # Restart the notebook and run the linear regression model above, but vary the number of training iterations and the learning rate and compare the results | |||||
| # ## Polynomial regression | |||||
| # Let's go one step further and talk about polynomial regression. What is it? Very simple: starting from the linear regression model above | |||||
| # | |||||
| # $$ | |||||
| # \hat{y} = w x + b | |||||
| # $$ | |||||
| # | |||||
| # this is a first-degree polynomial in x. It is rather limited and cannot fit more complicated functions, so we can use a higher-degree model such as | |||||
| # | |||||
| # $$ | |||||
| # \hat{y} = w_0 + w_1 x + w_2 x^2 + w_3 x^3 + \cdots | |||||
| # $$ | |||||
| # | |||||
| # which can fit more complicated functions. This is the polynomial model, which uses higher powers of x. Multivariate regression models have the same form, except that besides x they use additional variables such as y, z and so on; their loss function is the same as that of simple linear regression. | |||||
| # | |||||
| # | |||||
| # First let's define the target function to fit, which is a cubic polynomial | |||||
| # + | |||||
| # define the target function (a cubic polynomial) | |||||
| w_target = np.array([0.5, 3, 2.4]) # define the parameters | |||||
| b_target = np.array([0.9]) # define the parameters | |||||
| f_des = 'y = {:.2f} + {:.2f} * x + {:.2f} * x^2 + {:.2f} * x^3'.format( | |||||
| b_target[0], w_target[0], w_target[1], w_target[2]) # the formula of the function as a string, printed below | |||||
| print(f_des) | |||||
| # - | |||||
| # Let's first plot this polynomial | |||||
| # + | |||||
| # plot the curve of this function | |||||
| x_sample = np.arange(-3, 3.1, 0.1) | |||||
| y_sample = b_target[0] + w_target[0] * x_sample + w_target[1] * x_sample ** 2 + w_target[2] * x_sample ** 3 | |||||
| plt.plot(x_sample, y_sample, label='real curve') | |||||
| plt.legend() | |||||
| # - | |||||
| # Next we build the dataset. We need x and y, and since the target is a cubic polynomial we use the features $x,\ x^2, x^3$ | |||||
| # + | |||||
| # build the data x and y | |||||
| # x is a matrix of the form [x, x^2, x^3] | |||||
| # y is the value of the function [y] | |||||
| x_train = np.stack([x_sample ** i for i in range(1, 4)], axis=1) | |||||
| x_train = torch.from_numpy(x_train).float() # convert to a float tensor | |||||
| y_train = torch.from_numpy(y_sample).float().unsqueeze(1) # convert to a float tensor | |||||
| # - | |||||
| # Next we define the parameters to optimize, namely the $w_i$ in the function above | |||||
| # + | |||||
| # define the parameters and the model | |||||
| w = Variable(torch.randn(3, 1), requires_grad=True) | |||||
| b = Variable(torch.zeros(1), requires_grad=True) | |||||
| # convert x and y to Variable | |||||
| x_train = Variable(x_train) | |||||
| y_train = Variable(y_train) | |||||
| def multi_linear(x): | |||||
| return torch.mm(x, w) + b | |||||
| # - | |||||
| # We can compare the model before any update with the true model | |||||
| # + | |||||
| # plot the model before any update | |||||
| y_pred = multi_linear(x_train) | |||||
| plt.plot(x_train.data.numpy()[:, 0], y_pred.data.numpy(), label='fitting curve', color='r') | |||||
| plt.plot(x_train.data.numpy()[:, 0], y_sample, label='real curve', color='b') | |||||
| plt.legend() | |||||
| # - | |||||
| # The two curves clearly differ, so let's compute the error between them | |||||
| # compute the error; it is the same as for the one-variable linear model, and get_loss was already defined above | |||||
| loss = get_loss(y_pred, y_train) | |||||
| print(loss) | |||||
| # automatic differentiation | |||||
| loss.backward() | |||||
| # look at the gradients of w and b | |||||
| print(w.grad) | |||||
| print(b.grad) | |||||
| # update the parameters | |||||
| w.data = w.data - 0.001 * w.grad.data | |||||
| b.data = b.data - 0.001 * b.grad.data | |||||
| # + | |||||
| # plot the model after one update | |||||
| y_pred = multi_linear(x_train) | |||||
| plt.plot(x_train.data.numpy()[:, 0], y_pred.data.numpy(), label='fitting curve', color='r') | |||||
| plt.plot(x_train.data.numpy()[:, 0], y_sample, label='real curve', color='b') | |||||
| plt.legend() | |||||
| # - | |||||
| # Since we have only updated once, the two curves still differ; let's run 100 iterations | |||||
| # run 100 parameter updates | |||||
| for e in range(100): | |||||
| y_pred = multi_linear(x_train) | |||||
| loss = get_loss(y_pred, y_train) | |||||
| w.grad.data.zero_() | |||||
| b.grad.data.zero_() | |||||
| loss.backward() | |||||
| # update the parameters | |||||
| w.data = w.data - 0.001 * w.grad.data | |||||
| b.data = b.data - 0.001 * b.grad.data | |||||
| if (e + 1) % 20 == 0: | |||||
| print('epoch {}, Loss: {:.5f}'.format(e+1, loss.data[0])) | |||||
| # After the updates the loss is already very small; let's plot the curves again to compare | |||||
| # + | |||||
| # plot the result after the updates | |||||
| y_pred = multi_linear(x_train) | |||||
| plt.plot(x_train.data.numpy()[:, 0], y_pred.data.numpy(), label='fitting curve', color='r') | |||||
| plt.plot(x_train.data.numpy()[:, 0], y_sample, label='real curve', color='b') | |||||
| plt.legend() | |||||
| # - | |||||
| # After 100 updates, the fitted curve and the true curve almost completely coincide | |||||
| # **Exercise: the example above uses a cubic polynomial; try fitting it with a quadratic polynomial and see how well you can do** | |||||
| # | |||||
| # **Hint: use the parameter `w = torch.randn(2, 1)` and rebuild the x dataset accordingly** | |||||
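| # One possible sketch of this exercise (my own; it regenerates the target data so the cell stands alone) fits the cubic curve with only the features x and x^2: | |||||
| # + | |||||
| import numpy as np | |||||
| import torch | |||||
| from torch.autograd import Variable | |||||
| w_target = np.array([0.5, 3, 2.4])  # same cubic target as above | |||||
| b_target = np.array([0.9]) | |||||
| x_sample = np.arange(-3, 3.1, 0.1) | |||||
| y_sample = b_target[0] + w_target[0] * x_sample + w_target[1] * x_sample ** 2 + w_target[2] * x_sample ** 3 | |||||
| x_quad = Variable(torch.from_numpy(np.stack([x_sample ** i for i in range(1, 3)], axis=1)).float())  # features [x, x^2] | |||||
| y_quad = Variable(torch.from_numpy(y_sample).float().unsqueeze(1)) | |||||
| w = Variable(torch.randn(2, 1), requires_grad=True) | |||||
| b = Variable(torch.zeros(1), requires_grad=True) | |||||
| for e in range(100): | |||||
|     y_pred = torch.mm(x_quad, w) + b | |||||
|     loss = torch.mean((y_pred - y_quad) ** 2) | |||||
|     if w.grad is not None: | |||||
|         w.grad.data.zero_() | |||||
|         b.grad.data.zero_() | |||||
|     loss.backward() | |||||
|     w.data = w.data - 0.001 * w.grad.data | |||||
|     b.data = b.data - 0.001 * b.grad.data | |||||
| print(loss)  # typically stays well above zero, since x^3 cannot be represented by x and x^2 | |||||
| # - | |||||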
| @@ -0,0 +1,72 @@ | |||||
| import numpy as np | |||||
| import torch | |||||
| from torch.autograd import Variable | |||||
| import matplotlib.pyplot as plt | |||||
| """ | |||||
| Using pytorch to do linear regression | |||||
| """ | |||||
| torch.manual_seed(2018) | |||||
| # model's real-parameters | |||||
| w_target = 3 | |||||
| b_target = 10 | |||||
| # generate data | |||||
| n_data = 100 | |||||
| x_train = np.random.rand(n_data, 1)*20 - 10 | |||||
| y_train = w_target*x_train + b_target + (np.random.randn(n_data, 1)*10-5.0) | |||||
| # draw the data | |||||
| plt.plot(x_train, y_train, 'bo') | |||||
| plt.show() | |||||
| # convert to tensor | |||||
| x_train = torch.from_numpy(x_train).float() | |||||
| y_train = torch.from_numpy(y_train).float() | |||||
| # define model parameters | |||||
| w = Variable(torch.randn(1).float(), requires_grad=True) | |||||
| b = Variable(torch.zeros(1).float(), requires_grad=True) | |||||
| # construct the linear model | |||||
| x_train = Variable(x_train) | |||||
| y_train = Variable(y_train) | |||||
| # define model's function | |||||
| def linear_model(x): | |||||
| return x*w + b | |||||
| # define the loss function | |||||
| def get_loss(y_pred, y): | |||||
| return torch.mean((y_pred - y)**2) | |||||
| # upgrade parameters | |||||
| eta = 1e-2 | |||||
| for i in range(100): | |||||
| y_pred = linear_model(x_train) | |||||
| loss = get_loss(y_pred, y_train) | |||||
| loss.backward() | |||||
| w.data = w.data - eta*w.grad.data | |||||
| b.data = b.data - eta*b.grad.data | |||||
| w.grad.zero_() | |||||
| b.grad.zero_() | |||||
| if i % 10 == 0: | |||||
| print("epoch: %3d, loss: %f" % (i, loss.data[0])) | |||||
| # draw the results | |||||
| plt.plot(x_train.data.numpy(), y_train.data.numpy(), 'bo', label="Real") | |||||
| plt.plot(x_train.data.numpy(), y_pred.data.numpy(), 'ro', label="Estimated") | |||||
| plt.legend() | |||||
| plt.show() | |||||
| @@ -0,0 +1,92 @@ | |||||
| import numpy as np | |||||
| import torch | |||||
| from torch.autograd import Variable | |||||
| import matplotlib.pyplot as plt | |||||
| """ | |||||
| Using pytorch to do linear regression | |||||
| """ | |||||
| torch.manual_seed(2018) | |||||
| # generate data | |||||
| x_train = np.array([[3.3], [4.4], [5.5], [6.71], [6.93], [4.168], | |||||
| [9.779], [6.182], [7.59], [2.167], [7.042], | |||||
| [10.791], [5.313], [7.997], [3.1]], dtype=np.float32) | |||||
| y_train = np.array([[1.7], [2.76], [2.09], [3.19], [1.694], [1.573], | |||||
| [3.366], [2.596], [2.53], [1.221], [2.827], | |||||
| [3.465], [1.65], [2.904], [1.3]], dtype=np.float32) | |||||
| # draw the data | |||||
| plt.plot(x_train, y_train, 'bo') | |||||
| plt.show() | |||||
| # convert to tensor | |||||
| x_train = torch.from_numpy(x_train) | |||||
| y_train = torch.from_numpy(y_train) | |||||
| # define model parameters | |||||
| w = Variable(torch.randn(1), requires_grad=True) | |||||
| b = Variable(torch.zeros(1), requires_grad=True) | |||||
| # construct the linear model | |||||
| x_train = Variable(x_train) | |||||
| y_train = Variable(y_train) | |||||
| def linear_model(x): | |||||
| return x*w + b | |||||
| # first prediction | |||||
| y_pred = linear_model(x_train) | |||||
| # draw the real & predicted data | |||||
| plt.plot(x_train.data.numpy(), y_train.data.numpy(), 'bo', label="Real") | |||||
| plt.plot(x_train.data.numpy(), y_pred.data.numpy(), 'ro', label="Estimated") | |||||
| plt.legend() | |||||
| plt.show() | |||||
| # define the loss function | |||||
| def get_loss(y_pred, y): | |||||
| return torch.mean((y_pred - y)**2) | |||||
| loss = get_loss(y_pred, y_train) | |||||
| print("loss = %f" % float(loss)) | |||||
| # auto-grad | |||||
| loss.backward() | |||||
| print("w.grad = %f" % float(w.grad)) | |||||
| print("b.grad = %f" % float(b.grad)) | |||||
| # upgrade parameters | |||||
| eta = 1e-2 | |||||
| w.data = w.data - eta*w.grad.data | |||||
| b.data = b.data - eta*b.grad.data | |||||
| y_pred = linear_model(x_train) | |||||
| plt.plot(x_train.data.numpy(), y_train.data.numpy(), 'bo', label="Real") | |||||
| plt.plot(x_train.data.numpy(), y_pred.data.numpy(), 'ro', label="Estimated") | |||||
| plt.legend() | |||||
| plt.show() | |||||
| for i in range(10): | |||||
| y_pred = linear_model(x_train) | |||||
| loss = get_loss(y_pred, y_train) | |||||
| w.grad.zero_() | |||||
| b.grad.zero_() | |||||
| loss.backward() | |||||
| w.data = w.data - eta*w.grad.data | |||||
| b.data = b.data - eta*b.grad.data | |||||
| print("epoch: %3d, loss: %f" % (i, loss.data[0])) | |||||
| @@ -0,0 +1,77 @@ | |||||
| import numpy as np | |||||
| import torch | |||||
| from torch.autograd import Variable | |||||
| import matplotlib.pyplot as plt | |||||
| """ | |||||
| Polynomial fitting by pytorch | |||||
| """ | |||||
| # define the real model's parameters | |||||
| w_target = np.array([0.5, 3, 2.4]) | |||||
| b_target = np.array([0.9]) | |||||
| f_des = "y = %f + %f * x + %f * x^2 + %f * x^3" % ( | |||||
| b_target[0], | |||||
| w_target[0], w_target[1], w_target[2]) | |||||
| print(f_des) | |||||
| # draw the data | |||||
| x_sample = np.arange(-3, 3.1, 0.1) | |||||
| y_sample = b_target[0] + w_target[0]*x_sample + w_target[1]*x_sample**2 + w_target[2]*x_sample**3 | |||||
| plt.plot(x_sample, y_sample, label="Real") | |||||
| plt.legend() | |||||
| plt.show() | |||||
| # construct variables | |||||
| x_train = np.stack([x_sample**i for i in range(1, 4)], axis=1) | |||||
| x_train = torch.from_numpy(x_train).float() | |||||
| y_train = torch.from_numpy(y_sample).float().unsqueeze(1) | |||||
| # define model parameters | |||||
| w = Variable(torch.randn(3, 1).float(), requires_grad=True) | |||||
| b = Variable(torch.zeros(1).float(), requires_grad=True) | |||||
| x_train = Variable(x_train) | |||||
| y_train = Variable(y_train) | |||||
| # define the model function & loss function | |||||
| def polynomial(x): | |||||
| return torch.mm(x, w) + b | |||||
| def get_loss(y_pred, y): | |||||
| return torch.mean((y_pred-y)**2) | |||||
| # begin iterative optimization | |||||
| eta = 0.001 | |||||
| for i in range(100): | |||||
| y_pred = polynomial(x_train) | |||||
| loss = get_loss(y_pred, y_train) | |||||
| loss.backward() | |||||
| w.data = w.data - eta*w.grad.data | |||||
| b.data = b.data - eta*b.grad.data | |||||
| w.grad.data.zero_() | |||||
| b.grad.data.zero_() | |||||
| if i % 10 == 0: | |||||
| print("epoch: %4d, loss: %f" % (i, loss.data[0])) | |||||
| # draw the results | |||||
| y_pred = polynomial(x_train) | |||||
| plt.plot(x_train.data.numpy()[:, 0], y_sample, label="Real", color='b') | |||||
| plt.plot(x_train.data.numpy()[:, 0], y_pred.data.numpy(), label="Fitting", color='r') | |||||
| plt.legend() | |||||
| plt.show() | |||||
| @@ -0,0 +1,105 @@ | |||||
| import numpy as np | |||||
| import torch | |||||
| from torch.autograd import Variable | |||||
| import matplotlib.pyplot as plt | |||||
| """ | |||||
| Polynomial fitting by pytorch | |||||
| """ | |||||
| # define the model's parameters | |||||
| w_target = np.array([0.5, 3, 2.4]) | |||||
| b_target = np.array([0.9]) | |||||
| f_des = "y = %f + %f * x + %f * x^2 + %f * x^3" % ( | |||||
| b_target[0], | |||||
| w_target[0], w_target[1], w_target[2]) | |||||
| print(f_des) | |||||
| # draw the data | |||||
| x_sample = np.arange(-3, 3.1, 0.1) | |||||
| y_sample = b_target[0] + w_target[0]*x_sample + w_target[1]*x_sample**2 + w_target[2]*x_sample**3 | |||||
| plt.plot(x_sample, y_sample, label="Real") | |||||
| plt.legend() | |||||
| plt.show() | |||||
| # construct variables | |||||
| x_train = np.stack([x_sample**i for i in range(1, 4)], axis=1) | |||||
| x_train = torch.from_numpy(x_train).float() | |||||
| y_train = torch.from_numpy(y_sample).float().unsqueeze(1) | |||||
| # define model parameters | |||||
| w = Variable(torch.randn(3, 1).float(), requires_grad=True) | |||||
| b = Variable(torch.zeros(1).float(), requires_grad=True) | |||||
| x_train = Variable(x_train) | |||||
| y_train = Variable(y_train) | |||||
| print(w.shape) | |||||
| print(b.shape) | |||||
| print(x_train.shape) | |||||
| print(y_train.shape) | |||||
| def polynomial(x): | |||||
| return torch.mm(x, w) + b | |||||
| def get_loss(y_pred, y): | |||||
| return torch.mean((y_pred-y)**2) | |||||
| # draw initial graph | |||||
| y_pred = polynomial(x_train) | |||||
| plt.plot(x_train.data.numpy()[:, 0], y_sample, label="Real", color='b') | |||||
| plt.plot(x_train.data.numpy()[:, 0], y_pred.data.numpy(), label="Fitting", color='r') | |||||
| plt.legend() | |||||
| plt.show() | |||||
| # compute loss | |||||
| loss = get_loss(y_pred, y_train) | |||||
| print("Loss = %f" % loss) | |||||
| loss.backward() | |||||
| print(w.grad) | |||||
| print(b.grad) | |||||
| eta = 0.001 | |||||
| w.data = w.data - eta*w.grad.data | |||||
| b.data = b.data - eta*b.grad.data | |||||
| # second draw | |||||
| y_pred = polynomial(x_train) | |||||
| plt.plot(x_train.data.numpy()[:, 0], y_sample, label="Real", color='b') | |||||
| plt.plot(x_train.data.numpy()[:, 0], y_pred.data.numpy(), label="Fitting", color='r') | |||||
| plt.legend() | |||||
| plt.show() | |||||
| for i in range(100): | |||||
| y_pred = polynomial(x_train) | |||||
| loss = get_loss(y_pred, y_train) | |||||
| w.grad.data.zero_() | |||||
| b.grad.data.zero_() | |||||
| loss.backward() | |||||
| w.data = w.data - eta*w.grad.data | |||||
| b.data = b.data - eta*b.grad.data | |||||
| print("epoch: %4d, loss: %f" % (i, loss.data[0])) | |||||
| # second draw | |||||
| y_pred = polynomial(x_train) | |||||
| plt.plot(x_train.data.numpy()[:, 0], y_sample, label="Real", color='b') | |||||
| plt.plot(x_train.data.numpy()[:, 0], y_pred.data.numpy(), label="Fitting", color='r') | |||||
| plt.legend() | |||||
| plt.show() | |||||
| @@ -34,7 +34,7 @@ trainloader = t.utils.data.DataLoader( | |||||
| # 测试集 | # 测试集 | ||||
| testset = tv.datasets.CIFAR10( | testset = tv.datasets.CIFAR10( | ||||
| dataset_path, train=False, download=True, transform=transform) | |||||
| root=dataset_path, train=False, download=True, transform=transform) | |||||
| testloader = t.utils.data.DataLoader( | testloader = t.utils.data.DataLoader( | ||||
| testset, | testset, | ||||
| @@ -69,7 +69,7 @@ class Net(nn.Module): | |||||
| net = Net() | net = Net() | ||||
| print(net) | print(net) | ||||
| criterion = nn.CrossEntropyLoss() # 交叉熵损失函数 | |||||
| criterion = nn.CrossEntropyLoss() | |||||
| optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) | optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) | ||||
| t.set_num_threads(8) | t.set_num_threads(8) | ||||
| @@ -1,16 +1,18 @@ | |||||
| import torch | import torch | ||||
| from torch import nn, optim | from torch import nn, optim | ||||
| from torch.autograd import Variable | from torch.autograd import Variable | ||||
| from torch.utils.data import DataLoader | from torch.utils.data import DataLoader | ||||
| import torch.nn.functional as F | |||||
| from torchvision import transforms | from torchvision import transforms | ||||
| from torchvision import datasets | from torchvision import datasets | ||||
| # set parameters | |||||
| batch_size = 32 | batch_size = 32 | ||||
| learning_rate = 1e-2 | learning_rate = 1e-2 | ||||
| num_epoches = 50 | num_epoches = 50 | ||||
| # 下载训练集 MNIST 手写数字训练集 | |||||
| # download & load MNIST dataset | |||||
| dataset_path = "../data/mnist" | dataset_path = "../data/mnist" | ||||
| train_dataset = datasets.MNIST( | train_dataset = datasets.MNIST( | ||||
| @@ -23,70 +25,62 @@ train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) | |||||
| test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) | test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False) | ||||
| # 定义简单的前馈神经网络 | |||||
| class Neuralnetwork(nn.Module): | |||||
| # Define the network | |||||
| class NeuralNetwork(nn.Module): | |||||
| def __init__(self, in_dim, n_hidden_1, n_hidden_2, out_dim): | def __init__(self, in_dim, n_hidden_1, n_hidden_2, out_dim): | ||||
| super(Neuralnetwork, self).__init__() | |||||
| super(NeuralNetwork, self).__init__() | |||||
| self.layer1 = nn.Linear(in_dim, n_hidden_1) | self.layer1 = nn.Linear(in_dim, n_hidden_1) | ||||
| self.layer2 = nn.Linear(n_hidden_1, n_hidden_2) | self.layer2 = nn.Linear(n_hidden_1, n_hidden_2) | ||||
| self.layer3 = nn.Linear(n_hidden_2, out_dim) | self.layer3 = nn.Linear(n_hidden_2, out_dim) | ||||
| def forward(self, x): | def forward(self, x): | ||||
| x = self.layer1(x) | |||||
| x = self.layer2(x) | |||||
| x = F.relu(self.layer1(x)) | |||||
| x = F.relu(self.layer2(x)) | |||||
| x = self.layer3(x) | x = self.layer3(x) | ||||
| return x | return x | ||||
| model = Neuralnetwork(28 * 28, 300, 100, 10) | |||||
| if torch.cuda.is_available(): | |||||
| model = model.cuda() | |||||
| # create network & define loss function | |||||
| model = NeuralNetwork(28 * 28, 300, 100, 10) | |||||
| criterion = nn.CrossEntropyLoss() | criterion = nn.CrossEntropyLoss() | ||||
| optimizer = optim.SGD(model.parameters(), lr=learning_rate) | optimizer = optim.SGD(model.parameters(), lr=learning_rate) | ||||
| # train | |||||
| for epoch in range(num_epoches): | for epoch in range(num_epoches): | ||||
| print('epoch {}'.format(epoch + 1)) | |||||
| print('*' * 10) | |||||
| print("epoch %6d" % int(epoch+1)) | |||||
| print('-' * 40) | |||||
| running_loss = 0.0 | running_loss = 0.0 | ||||
| running_acc = 0.0 | running_acc = 0.0 | ||||
| for i, data in enumerate(train_loader, 1): | for i, data in enumerate(train_loader, 1): | ||||
| # FIXME: label need to change one-hot coding | |||||
| img, label = data | img, label = data | ||||
| img = img.view(img.size(0), -1) | |||||
| target = torch.zeros(label.size(0), 10) | |||||
| target = target.scatter_(1, label.data, 1) | |||||
| if torch.cuda.is_available(): | |||||
| img = Variable(img).cuda() | |||||
| label = Variable(label).cuda() | |||||
| else: | |||||
| img = Variable(img) | |||||
| label = Variable(label) | |||||
| img = Variable(img.view(img.size(0), -1)) | |||||
| label = Variable(label) | |||||
| # 向前传播 | # 向前传播 | ||||
| optimizer.zero_grad() | |||||
| out = model(img) | out = model(img) | ||||
| loss = criterion(out, label) | loss = criterion(out, label) | ||||
| running_loss += loss.data[0] * label.size(0) | running_loss += loss.data[0] * label.size(0) | ||||
| _, pred = torch.max(out, 1) | |||||
| num_correct = (pred == label).sum() | |||||
| running_acc += num_correct.data[0] | |||||
| pred = out.data.max(1, keepdim=True)[1] | |||||
| running_acc += float(pred.eq(label.data.view_as(pred)).cpu().sum()) | |||||
| # 向后传播 | # 向后传播 | ||||
| optimizer.zero_grad() | |||||
| loss.backward() | loss.backward() | ||||
| optimizer.step() | optimizer.step() | ||||
| if i % 300 == 0: | if i % 300 == 0: | ||||
| print('[{}/{}] Loss: {:.6f}, Acc: {:.6f}'.format( | |||||
| epoch + 1, num_epoches, running_loss / (batch_size * i), | |||||
| running_acc / (batch_size * i))) | |||||
| print('Finish {} epoch, Loss: {:.6f}, Acc: {:.6f}'.format( | |||||
| epoch + 1, running_loss / (len(train_dataset)), running_acc / (len( | |||||
| train_dataset)))) | |||||
| print('[{}/{}] Loss: {:.6f}, Acc: {:.2f}%'.format( | |||||
| epoch + 1, num_epoches, | |||||
| 1.0*running_loss / (batch_size * i), | |||||
| 100.0*running_acc / (batch_size * i))) | |||||
| # do test | |||||
| model.eval() | model.eval() | ||||
| eval_loss = 0. | eval_loss = 0. | ||||
| eval_acc = 0. | eval_acc = 0. | ||||
| @@ -94,22 +88,23 @@ for epoch in range(num_epoches): | |||||
| for data in test_loader: | for data in test_loader: | ||||
| img, label = data | img, label = data | ||||
| img = img.view(img.size(0), -1) | img = img.view(img.size(0), -1) | ||||
| if torch.cuda.is_available(): | |||||
| img = Variable(img, volatile=True).cuda() | |||||
| label = Variable(label, volatile=True).cuda() | |||||
| else: | |||||
| img = Variable(img, volatile=True) | |||||
| label = Variable(label, volatile=True) | |||||
| img = Variable(img) | |||||
| label = Variable(label) | |||||
| out = model(img) | out = model(img) | ||||
| loss = criterion(out, label) | loss = criterion(out, label) | ||||
| eval_loss += loss.data[0] * label.size(0) | eval_loss += loss.data[0] * label.size(0) | ||||
| _, pred = torch.max(out, 1) | |||||
| num_correct = (pred == label).sum() | |||||
| eval_acc += num_correct.data[0] | |||||
| pred = out.data.max(1, keepdim=True)[1] | |||||
| eval_acc += float(pred.eq(label.data.view_as(pred)).cpu().sum()) | |||||
| print('Test Loss: {:.6f}, Acc: {:.6f}'.format(eval_loss / (len( | |||||
| test_dataset)), eval_acc / (len(test_dataset)))) | |||||
| print('\nTest Loss: {:.6f}, Acc: {:.2f}%'.format( | |||||
| 1.0*eval_loss / (len(test_dataset)), | |||||
| 100.0*eval_acc / (len(test_dataset)))) | |||||
| print() | print() | ||||
| # 保存模型 | |||||
| # save model | |||||
| torch.save(model.state_dict(), './model_Neural_Network.pth') | torch.save(model.state_dict(), './model_Neural_Network.pth') | ||||
| @@ -1,5 +1,3 @@ | |||||
| from __future__ import print_function | |||||
| import torch | import torch | ||||
| import torch.nn as nn | import torch.nn as nn | ||||
| import torch.nn.functional as F | import torch.nn.functional as F | ||||
| @@ -58,7 +56,8 @@ optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5) | |||||
| def train(epoch): | def train(epoch): | ||||
| #model.train() | |||||
| model.train() | |||||
| for batch_idx, (data, target) in enumerate(train_loader): | for batch_idx, (data, target) in enumerate(train_loader): | ||||
| data, target = Variable(data), Variable(target) | data, target = Variable(data), Variable(target) | ||||
| optimizer.zero_grad() | optimizer.zero_grad() | ||||
| @@ -66,30 +65,33 @@ def train(epoch): | |||||
| loss = criterion(output, target) | loss = criterion(output, target) | ||||
| loss.backward() | loss.backward() | ||||
| optimizer.step() | optimizer.step() | ||||
| if batch_idx % 10 == 0: | |||||
| print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( | |||||
| if batch_idx % 100 == 0: | |||||
| print("Train epoch: %6d [%6d/%6d (%.0f %%)] \t Loss: %.6f" % ( | |||||
| epoch, batch_idx * len(data), len(train_loader.dataset), | epoch, batch_idx * len(data), len(train_loader.dataset), | ||||
| 100. * batch_idx / len(train_loader), loss.data[0])) | |||||
| 100. * batch_idx / len(train_loader), loss.data[0]) ) | |||||
| def test(): | def test(): | ||||
| model.eval() | model.eval() | ||||
| test_loss = 0 | |||||
| correct = 0 | |||||
| test_loss = 0.0 | |||||
| correct = 0.0 | |||||
| for data, target in test_loader: | for data, target in test_loader: | ||||
| data, target = Variable(data, volatile=True), Variable(target) | |||||
| data, target = Variable(data), Variable(target) | |||||
| output = model(data) | output = model(data) | ||||
| # sum up batch loss | # sum up batch loss | ||||
| test_loss += criterion(output, target).data[0] | test_loss += criterion(output, target).data[0] | ||||
| # get the index of the max | # get the index of the max | ||||
| pred = output.data.max(1, keepdim=True)[1] | pred = output.data.max(1, keepdim=True)[1] | ||||
| correct += pred.eq(target.data.view_as(pred)).cpu().sum() | |||||
| correct += float(pred.eq(target.data.view_as(pred)).cpu().sum()) | |||||
| test_loss /= len(test_loader.dataset) | test_loss /= len(test_loader.dataset) | ||||
| print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( | |||||
| test_loss, correct, len(test_loader.dataset), | |||||
| 100. * correct / len(test_loader.dataset))) | |||||
| print("\nTest set: Average loss: %.4f, Accuracy: %6d/%6d (%4.2f %%)\n" % | |||||
| (test_loss, | |||||
| correct, len(test_loader.dataset), | |||||
| 100.0*correct / len(test_loader.dataset)) ) | |||||
| for epoch in range(1, 10): | for epoch in range(1, 10): | ||||
| train(epoch) | train(epoch) | ||||
| @@ -0,0 +1,6 @@ | |||||
| import torch | |||||
| a = torch.tensor([1, 2, 3, 4, 3.5]) | |||||
| f = 1.0 * a.sum() / 10.0 | |||||
| print("f = %f" % f) | |||||