You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

fastnlp_1_minute_tutorial.ipynb 6.7 kB

  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {
  6. "collapsed": true
  7. },
  8. "source": [
  9. "# FastNLP 1分钟上手教程"
  10. ]
  11. },
  12. {
  13. "cell_type": "markdown",
  14. "metadata": {},
  15. "source": [
  16. "## step 1\n",
  17. "读取数据集"
  18. ]
  19. },
  20. {
  21. "cell_type": "code",
  22. "execution_count": 50,
  23. "metadata": {},
  24. "outputs": [],
  25. "source": [
  26. "from fastNLP import DataSet\n",
  27. "# linux_path = \"../test/data_for_tests/tutorial_sample_dataset.csv\"\n",
  28. "win_path = \"C:\\\\Users\\\\zyfeng\\\\Desktop\\\\FudanNLP\\\\fastNLP\\\\test\\\\data_for_tests\\\\tutorial_sample_dataset.csv\"\n",
  29. "ds = DataSet.read_csv(win_path, headers=('raw_sentence', 'label'), sep='\\t')"
  30. ]
  31. },
  32. {
  33. "cell_type": "markdown",
  34. "metadata": {},
  35. "source": [
  36. "## step 2\n",
  37. "数据预处理\n",
  38. "1. 类型转换\n",
  39. "2. 切分验证集\n",
  40. "3. 构建词典"
  41. ]
  42. },
  43. {
  44. "cell_type": "code",
  45. "execution_count": 52,
  46. "metadata": {},
  47. "outputs": [],
  48. "source": [
  49. "# 将所有字母转为小写\n",
  50. "ds.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')\n",
  51. "# label转int\n",
  52. "ds.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True)\n",
  53. "\n",
  54. "def split_sent(ins):\n",
  55. " return ins['raw_sentence'].split()\n",
  56. "ds.apply(split_sent, new_field_name='words', is_input=True)\n"
  57. ]
  58. },
  59. {
  60. "cell_type": "code",
  61. "execution_count": 60,
  62. "metadata": {
  63. "collapsed": false
  64. },
  65. "outputs": [
  66. {
  67. "name": "stdout",
  68. "output_type": "stream",
  69. "text": [
  70. "Train size: "
  71. ]
  72. },
  73. {
  74. "name": "stdout",
  75. "output_type": "stream",
  76. "text": [
  77. " "
  78. ]
  79. },
  80. {
  81. "name": "stdout",
  82. "output_type": "stream",
  83. "text": [
  84. "54"
  85. ]
  86. },
  87. {
  88. "name": "stdout",
  89. "output_type": "stream",
  90. "text": [
  91. "\n"
  92. ]
  93. },
  94. {
  95. "name": "stdout",
  96. "output_type": "stream",
  97. "text": [
  98. "Test size: "
  99. ]
  100. },
  101. {
  102. "name": "stdout",
  103. "output_type": "stream",
  104. "text": [
  105. " "
  106. ]
  107. },
  108. {
  109. "name": "stdout",
  110. "output_type": "stream",
  111. "text": [
  112. "23"
  113. ]
  114. },
  115. {
  116. "name": "stdout",
  117. "output_type": "stream",
  118. "text": [
  119. "\n"
  120. ]
  121. }
  122. ],
  123. "source": [
  124. "# 分割训练集/验证集\n",
  125. "train_data, dev_data = ds.split(0.3)\n",
  126. "print(\"Train size: \", len(train_data))\n",
  127. "print(\"Test size: \", len(dev_data))"
  128. ]
  129. },
  130. {
  131. "cell_type": "code",
  132. "execution_count": 61,
  133. "metadata": {},
  134. "outputs": [],
  135. "source": [
  136. "from fastNLP import Vocabulary\n",
  137. "vocab = Vocabulary(min_freq=2)\n",
  138. "train_data.apply(lambda x: [vocab.add(word) for word in x['words']])\n",
  139. "\n",
  140. "# index句子, Vocabulary.to_index(word)\n",
  141. "train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True)\n",
  142. "dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True)\n"
  143. ]
  144. },
  145. {
  146. "cell_type": "markdown",
  147. "metadata": {},
  148. "source": [
  149. "## step 3\n",
  150. "定义模型"
  151. ]
  152. },
  153. {
  154. "cell_type": "code",
  155. "execution_count": 62,
  156. "metadata": {},
  157. "outputs": [],
  158. "source": [
  159. "from fastNLP.models import CNNText\n",
  160. "model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)\n"
  161. ]
  162. },
  163. {
  164. "cell_type": "markdown",
  165. "metadata": {},
  166. "source": [
  167. "## step 4\n",
  168. "开始训练"
  169. ]
  170. },
  171. {
  172. "cell_type": "code",
  173. "execution_count": 63,
  174. "metadata": {},
  175. "outputs": [
  176. {
  177. "name": "stdout",
  178. "output_type": "stream",
  179. "text": [
  180. "training epochs started 2018-12-07 14:03:41"
  181. ]
  182. },
  183. {
  184. "name": "stdout",
  185. "output_type": "stream",
  186. "text": [
  187. "\n"
  188. ]
  189. },
  190. {
  191. "data": {
  192. "text/plain": [
  193. "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=6), HTML(value='')), layout=Layout(display='i…"
  194. ]
  195. },
  196. "execution_count": 0,
  197. "metadata": {},
  198. "output_type": "execute_result"
  199. },
  200. {
  201. "name": "stdout",
  202. "output_type": "stream",
  203. "text": [
  204. "\r"
  205. ]
  206. },
  207. {
  208. "name": "stdout",
  209. "output_type": "stream",
  210. "text": [
  211. "Epoch 1/3. Step:2/6. AccuracyMetric: acc=0.26087"
  212. ]
  213. },
  214. {
  215. "name": "stdout",
  216. "output_type": "stream",
  217. "text": [
  218. "\n"
  219. ]
  220. },
  221. {
  222. "name": "stdout",
  223. "output_type": "stream",
  224. "text": [
  225. "\r"
  226. ]
  227. },
  228. {
  229. "name": "stdout",
  230. "output_type": "stream",
  231. "text": [
  232. "Epoch 2/3. Step:4/6. AccuracyMetric: acc=0.347826"
  233. ]
  234. },
  235. {
  236. "name": "stdout",
  237. "output_type": "stream",
  238. "text": [
  239. "\n"
  240. ]
  241. },
  242. {
  243. "name": "stdout",
  244. "output_type": "stream",
  245. "text": [
  246. "\r"
  247. ]
  248. },
  249. {
  250. "name": "stdout",
  251. "output_type": "stream",
  252. "text": [
  253. "Epoch 3/3. Step:6/6. AccuracyMetric: acc=0.608696"
  254. ]
  255. },
  256. {
  257. "name": "stdout",
  258. "output_type": "stream",
  259. "text": [
  260. "\n"
  261. ]
  262. },
  263. {
  264. "name": "stdout",
  265. "output_type": "stream",
  266. "text": [
  267. "\r"
  268. ]
  269. },
  270. {
  271. "name": "stdout",
  272. "output_type": "stream",
  273. "text": [
  274. "Train finished!"
  275. ]
  276. },
  277. {
  278. "name": "stdout",
  279. "output_type": "stream",
  280. "text": [
  281. "\n"
  282. ]
  283. }
  284. ],
  285. "source": [
  286. "from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric\n",
  287. "trainer = Trainer(model=model, \n",
  288. " train_data=train_data, \n",
  289. " dev_data=dev_data,\n",
  290. " loss=CrossEntropyLoss(),\n",
  291. " metrics=AccuracyMetric()\n",
  292. " )\n",
  293. "trainer.train()\n",
  294. "print('Train finished!')\n"
  295. ]
  296. },
  297. {
  298. "cell_type": "markdown",
  299. "metadata": {},
  300. "source": [
  301. "### 本教程结束。更多操作请参考进阶教程。"
  302. ]
  303. },
  304. {
  305. "cell_type": "code",
  306. "execution_count": null,
  307. "metadata": {},
  308. "outputs": [],
  309. "source": []
  310. }
  311. ],
  312. "metadata": {
  313. "kernelspec": {
  314. "display_name": "Python 2",
  315. "language": "python",
  316. "name": "python2"
  317. },
  318. "language_info": {
  319. "codemirror_mode": {
  320. "name": "ipython",
  321. "version": 2
  322. },
  323. "file_extension": ".py",
  324. "mimetype": "text/x-python",
  325. "name": "python",
  326. "nbconvert_exporter": "python",
  327. "pygments_lexer": "ipython2",
  328. "version": "2.7.6"
  329. }
  330. },
  331. "nbformat": 4,
  332. "nbformat_minor": 0
  333. }