You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

imdb.py 4.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. # coding=utf-8
  2. # Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. # Lint as: python3
  16. """IMDB movie reviews dataset."""
  17. import datasets
  18. from datasets.tasks import TextClassification
# Dataset-card description shown to users; backslash continuations keep the
# rendered text as a single paragraph without embedded newlines.
_DESCRIPTION = """\
Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially \
more data than previous benchmark datasets. We provide a set of 25,000 highly \
polar movie reviews for training, and 25,000 for testing. There is additional \
unlabeled data for use as well.\
"""
# BibTeX entry for the ACL 2011 paper that introduced the aclImdb corpus
# (Maas et al., "Learning Word Vectors for Sentiment Analysis").
_CITATION = """\
@InProceedings{maas-EtAl:2011:ACL-HLT2011,
author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},
title = {Learning Word Vectors for Sentiment Analysis},
booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
month = {June},
year = {2011},
address = {Portland, Oregon, USA},
publisher = {Association for Computational Linguistics},
pages = {142--150},
url = {http://www.aclweb.org/anthology/P11-1015}
}
"""
# Single tarball containing the train/, test/ and unsup/ sub-directories;
# downloaded once in _split_generators and streamed per split.
_DOWNLOAD_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
  40. class IMDBReviewsConfig(datasets.BuilderConfig):
  41. """BuilderConfig for IMDBReviews."""
  42. def __init__(self, **kwargs):
  43. """BuilderConfig for IMDBReviews.
  44. Args:
  45. **kwargs: keyword arguments forwarded to super.
  46. """
  47. super(IMDBReviewsConfig, self).__init__(version=datasets.Version("1.0.0", ""), **kwargs)
  48. class Imdb(datasets.GeneratorBasedBuilder):
  49. """IMDB movie reviews dataset."""
  50. BUILDER_CONFIGS = [
  51. IMDBReviewsConfig(
  52. name="plain_text",
  53. description="Plain text",
  54. )
  55. ]
  56. def _info(self):
  57. return datasets.DatasetInfo(
  58. description=_DESCRIPTION,
  59. features=datasets.Features(
  60. {"text": datasets.Value("string"), "label": datasets.features.ClassLabel(names=["neg", "pos"])}
  61. ),
  62. supervised_keys=None,
  63. homepage="http://ai.stanford.edu/~amaas/data/sentiment/",
  64. citation=_CITATION,
  65. task_templates=[TextClassification(text_column="text", label_column="label")],
  66. )
  67. def _split_generators(self, dl_manager):
  68. archive = dl_manager.download(_DOWNLOAD_URL)
  69. return [
  70. datasets.SplitGenerator(
  71. name=datasets.Split.TRAIN, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train"}
  72. ),
  73. datasets.SplitGenerator(
  74. name=datasets.Split.TEST, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "test"}
  75. ),
  76. datasets.SplitGenerator(
  77. name=datasets.Split("unsupervised"),
  78. gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train", "labeled": False},
  79. ),
  80. ]
  81. def _generate_examples(self, files, split, labeled=True):
  82. """Generate aclImdb examples."""
  83. # For labeled examples, extract the label from the path.
  84. if labeled:
  85. label_mapping = {"pos": 1, "neg": 0}
  86. for path, f in files:
  87. if path.startswith(f"aclImdb/{split}"):
  88. label = label_mapping.get(path.split("/")[2])
  89. if label is not None:
  90. yield path, {"text": f.read().decode("utf-8"), "label": label}
  91. else:
  92. for path, f in files:
  93. if path.startswith(f"aclImdb/{split}"):
  94. if path.split("/")[2] == "unsup":
  95. yield path, {"text": f.read().decode("utf-8"), "label": -1}