
test_data.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# __author__="Danqing Wang"
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
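
# Consistency test: checks that the fastNLP-based SummarizationLoader produces
# the same padded inputs, sequence lengths, and labels as the original
# Vocab/ExampleSet data pipeline on CNN/DailyMail.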
import os
import sys

# Points at a local fastNLP checkout; adjust for your environment.
sys.path.append('/remote-home/dqwang/FastNLP/fastNLP_brxx/')

from fastNLP.core.const import Const

from data.dataloader import SummarizationLoader
from tools.data import ExampleSet, Vocab
vocab_size = 100000    # cap on vocabulary size
vocab_path = "test/testdata/vocab"
sent_max_len = 100     # maximum tokens kept per sentence
doc_max_timesteps = 50 # maximum sentences kept per document

# paths = {"train": "test/testdata/train.jsonl", "valid": "test/testdata/val.jsonl"}
paths = {"train": "/remote-home/dqwang/Datasets/CNNDM/train.label.jsonl", "valid": "/remote-home/dqwang/Datasets/CNNDM/val.label.jsonl"}

sum_loader = SummarizationLoader()
dataInfo = sum_loader.process(paths=paths, vocab_size=vocab_size, vocab_path=vocab_path, sent_max_len=sent_max_len, doc_max_timesteps=doc_max_timesteps, load_vocab_file=True)
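
# dataInfo bundles the processed splits; below, the same raw train file is also
# run through the original Vocab/ExampleSet pipeline so the two can be compared.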
trainset = dataInfo.datasets["train"]

vocab = Vocab(vocab_path, vocab_size)
dataset = ExampleSet(paths["train"], vocab, doc_max_timesteps, sent_max_len)

# print(trainset[0]["text"])
# print(dataset.get_example(0).original_article_sents)
# print(trainset[0]["words"])
# print(dataset[0][0].numpy().tolist())
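
# Walk every example and assert that the fastNLP pipeline (trainset) and the
# reference ExampleSet (dataset) agree on word ids, labels, and sequence lengths.
# The `i <= 7327` guard presumably skips a prefix already verified in an earlier run.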
b_size = len(trainset)
for i in range(b_size):
    if i <= 7327:
        continue
    print(trainset[i][Const.INPUT])
    print(dataset[i][0].numpy().tolist())
    assert trainset[i][Const.INPUT] == dataset[i][0].numpy().tolist(), i
    assert trainset[i][Const.INPUT_LEN] == dataset[i][2].numpy().tolist(), i
    assert trainset[i][Const.TARGET] == dataset[i][1].numpy().tolist(), i
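
For reuse, the same check can be factored into a helper. The sketch below is hypothetical (compare_fields is not part of the repo) and assumes the trainset and dataset objects built above; it reports the first mismatching index instead of stopping on an AssertionError.

# Hypothetical helper (not in the repo): field-by-field comparison of the
# fastNLP pipeline output against the reference ExampleSet implementation.
from fastNLP.core.const import Const

def compare_fields(fastnlp_set, example_set, start=0):
    """Return the index of the first mismatching example, or None if all agree."""
    for i in range(start, len(fastnlp_set)):
        tensors = example_set[i]  # (input ids, labels, seq lens), as in the asserts above
        same = (
            fastnlp_set[i][Const.INPUT] == tensors[0].numpy().tolist()
            and fastnlp_set[i][Const.TARGET] == tensors[1].numpy().tolist()
            and fastnlp_set[i][Const.INPUT_LEN] == tensors[2].numpy().tolist()
        )
        if not same:
            return i
    return None

# Usage with the objects built above, skipping the already-verified prefix:
# first_bad = compare_fields(trainset, dataset, start=7328)
# print("first mismatch:", first_bad)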