You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

c_api_text_test.cc 197 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989
  1. /**
  2. * Copyright 2020-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <memory>
  17. #include <string>
  18. #include <vector>
  19. #include "common/common.h"
  20. #include "include/api/status.h"
  21. #include "minddata/dataset/include/dataset/config.h"
  22. #include "minddata/dataset/include/dataset/datasets.h"
  23. #include "minddata/dataset/include/dataset/text.h"
  24. #include "minddata/dataset/include/dataset/transforms.h"
  25. #include "minddata/dataset/text/char_n_gram.h"
  26. #include "minddata/dataset/text/fast_text.h"
  27. #include "minddata/dataset/text/glove.h"
  28. #include "minddata/dataset/text/vectors.h"
  29. #include "minddata/dataset/text/vocab.h"
  30. using namespace mindspore::dataset;
  31. using mindspore::Status;
  32. using mindspore::dataset::CharNGram;
  33. using mindspore::dataset::FastText;
  34. using mindspore::dataset::GloVe;
  35. using mindspore::dataset::ShuffleMode;
  36. using mindspore::dataset::Tensor;
  37. using mindspore::dataset::Vectors;
  38. using mindspore::dataset::Vocab;
  39. class MindDataTestPipeline : public UT::DatasetOpTesting {
  40. protected:
  41. };
  42. TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess1) {
  43. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess1.";
  44. // Test BasicTokenizer with default parameters
  45. // Create a TextFile dataset
  46. std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  47. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  48. EXPECT_NE(ds, nullptr);
  49. // Create Take operation on ds
  50. ds = ds->Take(6);
  51. EXPECT_NE(ds, nullptr);
  52. // Create BasicTokenizer operation on ds
  53. std::shared_ptr<TensorTransform> basic_tokenizer = std::make_shared<text::BasicTokenizer>();
  54. EXPECT_NE(basic_tokenizer, nullptr);
  55. // Create Map operation on ds
  56. ds = ds->Map({basic_tokenizer}, {"text"});
  57. EXPECT_NE(ds, nullptr);
  58. // Create an iterator over the result of the above dataset
  59. // This will trigger the creation of the Execution Tree and launch it.
  60. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  61. EXPECT_NE(iter, nullptr);
  62. // Iterate the dataset and get each row
  63. std::unordered_map<std::string, mindspore::MSTensor> row;
  64. ASSERT_OK(iter->GetNextRow(&row));
  65. std::vector<std::vector<std::string>> expected = {
  66. {"Welcome", "to", "Beijing", "北", "京", "欢", "迎", "您"},
  67. {"長", "風", "破", "浪", "會", "有", "時", ",", "直", "掛", "雲", "帆", "濟", "滄", "海"},
  68. {"😀", "嘿", "嘿", "😃", "哈", "哈", "😄", "大", "笑", "😁", "嘻", "嘻"},
  69. {"明", "朝", "(", "1368", "—", "1644", "年", ")", "和", "清", "朝", "(", "1644", "—", "1911", "年", ")",
  70. ",", "是", "中", "国", "封", "建", "王", "朝", "史", "上", "最", "后", "两", "个", "朝", "代"},
  71. {"明", "代", "(", "1368", "-", "1644", ")", "と", "清", "代", "(", "1644",
  72. "-", "1911", ")", "は", "、", "中", "国", "の", "封", "建", "王", "朝",
  73. "の", "歴", "史", "における", "最", "後", "の2つの", "王", "朝", "でした"},
  74. {"명나라", "(", "1368", "-", "1644", ")", "와", "청나라", "(", "1644", "-",
  75. "1911", ")", "는", "중국", "봉건", "왕조의", "역사에서", "마지막", "두", "왕조였다"}};
  76. uint64_t i = 0;
  77. while (row.size() != 0) {
  78. auto ind = row["text"];
  79. std::shared_ptr<Tensor> de_expected_tensor;
  80. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  81. mindspore::MSTensor expected_tensor =
  82. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  83. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  84. ASSERT_OK(iter->GetNextRow(&row));
  85. i++;
  86. }
  87. EXPECT_EQ(i, 6);
  88. // Manually terminate the pipeline
  89. iter->Stop();
  90. }
  91. TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess2) {
  92. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess2.";
  93. // Test BasicTokenizer with lower_case true
  94. // Create a TextFile dataset
  95. std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  96. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  97. EXPECT_NE(ds, nullptr);
  98. // Create Skip operation on ds
  99. ds = ds->Skip(6);
  100. EXPECT_NE(ds, nullptr);
  101. // Create BasicTokenizer operation on ds
  102. std::shared_ptr<TensorTransform> basic_tokenizer = std::make_shared<text::BasicTokenizer>(true);
  103. EXPECT_NE(basic_tokenizer, nullptr);
  104. // Create Map operation on ds
  105. ds = ds->Map({basic_tokenizer}, {"text"});
  106. EXPECT_NE(ds, nullptr);
  107. // Create an iterator over the result of the above dataset
  108. // This will trigger the creation of the Execution Tree and launch it.
  109. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  110. EXPECT_NE(iter, nullptr);
  111. // Iterate the dataset and get each row
  112. std::unordered_map<std::string, mindspore::MSTensor> row;
  113. ASSERT_OK(iter->GetNextRow(&row));
  114. std::vector<std::string> expected = {"this", "is", "a", "funky", "string"};
  115. std::shared_ptr<Tensor> de_expected_tensor;
  116. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  117. mindspore::MSTensor expected_tensor =
  118. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  119. uint64_t i = 0;
  120. while (row.size() != 0) {
  121. auto ind = row["text"];
  122. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  123. ASSERT_OK(iter->GetNextRow(&row));
  124. i++;
  125. }
  126. EXPECT_EQ(i, 1);
  127. // Manually terminate the pipeline
  128. iter->Stop();
  129. }
  130. TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess3) {
  131. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess3.";
  132. // Test BasicTokenizer with with_offsets true and lower_case true
  133. // Create a TextFile dataset
  134. std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  135. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  136. EXPECT_NE(ds, nullptr);
  137. // Create Skip operation on ds
  138. ds = ds->Skip(6);
  139. EXPECT_NE(ds, nullptr);
  140. // Create BasicTokenizer operation on ds
  141. std::shared_ptr<TensorTransform> basic_tokenizer =
  142. std::make_shared<text::BasicTokenizer>(true, false, NormalizeForm::kNone, true, true);
  143. EXPECT_NE(basic_tokenizer, nullptr);
  144. // Create Map operation on ds
  145. ds = ds->Map({basic_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  146. EXPECT_NE(ds, nullptr);
  147. // Create an iterator over the result of the above dataset
  148. // This will trigger the creation of the Execution Tree and launch it.
  149. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  150. EXPECT_NE(iter, nullptr);
  151. // Iterate the dataset and get each row
  152. std::unordered_map<std::string, mindspore::MSTensor> row;
  153. ASSERT_OK(iter->GetNextRow(&row));
  154. std::vector<std::string> expected_tokens = {"this", "is", "a", "funky", "string"};
  155. std::vector<uint32_t> expected_offsets_start = {0, 5, 8, 10, 16};
  156. std::vector<uint32_t> expected_offsets_limit = {4, 7, 9, 15, 22};
  157. std::shared_ptr<Tensor> de_expected_tokens;
  158. ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens));
  159. mindspore::MSTensor ms_expected_tokens =
  160. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  161. std::shared_ptr<Tensor> de_expected_offsets_start;
  162. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start));
  163. mindspore::MSTensor ms_expected_offsets_start =
  164. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  165. std::shared_ptr<Tensor> de_expected_offsets_limit;
  166. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit));
  167. mindspore::MSTensor ms_expected_offsets_limit =
  168. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  169. uint64_t i = 0;
  170. while (row.size() != 0) {
  171. auto ind = row["token"];
  172. EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens);
  173. auto start = row["offsets_start"];
  174. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  175. auto limit = row["offsets_limit"];
  176. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  177. ASSERT_OK(iter->GetNextRow(&row));
  178. i++;
  179. }
  180. EXPECT_EQ(i, 1);
  181. // Manually terminate the pipeline
  182. iter->Stop();
  183. }
  // Word list that the BertTokenizer tests below pass to Vocab::BuildFromVector.
  // Element order is kept exactly as-is; the rows are only grouped by kind for readability.
  std::vector<std::string> list{
    // Chinese characters
    "床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头", "望",
    "低", "思", "故", "乡", "繁", "體", "字", "嘿", "哈", "大", "笑", "嘻",
    // English words and "##"-prefixed pieces
    "i", "am", "mak", "make", "small", "mistake", "##s", "during", "work", "##ing", "hour",
    // Emoji
    "😀", "😃", "😄", "😁",
    // Symbols
    "+", "/", "-", "=",
    // Numbers
    "12", "28", "40", "16",
    // Space, upper-case word and bracketed special tokens
    " ", "I", "[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]", "[unused1]", "[unused10]"};
  190. TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess1) {
  191. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess1.";
  192. // Test BertTokenizer with default parameters
  193. // Create a TextFile dataset
  194. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  195. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  196. EXPECT_NE(ds, nullptr);
  197. // Create Take operation on ds
  198. ds = ds->Take(4);
  199. EXPECT_NE(ds, nullptr);
  200. // Create a vocab from vector
  201. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  202. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  203. EXPECT_EQ(s, Status::OK());
  204. // Create BertTokenizer operation on ds
  205. std::shared_ptr<TensorTransform> bert_tokenizer = std::make_shared<text::BertTokenizer>(vocab);
  206. EXPECT_NE(bert_tokenizer, nullptr);
  207. // Create Map operation on ds
  208. ds = ds->Map({bert_tokenizer}, {"text"});
  209. EXPECT_NE(ds, nullptr);
  210. // Create an iterator over the result of the above dataset
  211. // This will trigger the creation of the Execution Tree and launch it.
  212. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  213. EXPECT_NE(iter, nullptr);
  214. // Iterate the dataset and get each row
  215. std::unordered_map<std::string, mindspore::MSTensor> row;
  216. ASSERT_OK(iter->GetNextRow(&row));
  217. std::vector<std::vector<std::string>> expected = {{"床", "前", "明", "月", "光"},
  218. {"疑", "是", "地", "上", "霜"},
  219. {"举", "头", "望", "明", "月"},
  220. {"低", "头", "思", "故", "乡"}};
  221. uint64_t i = 0;
  222. while (row.size() != 0) {
  223. auto ind = row["text"];
  224. std::shared_ptr<Tensor> de_expected_tensor;
  225. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  226. mindspore::MSTensor expected_tensor =
  227. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  228. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  229. ASSERT_OK(iter->GetNextRow(&row));
  230. i++;
  231. }
  232. EXPECT_EQ(i, 4);
  233. // Manually terminate the pipeline
  234. iter->Stop();
  235. }
  236. TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess2) {
  237. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess2.";
  238. // Test BertTokenizer with lower_case true
  239. // Create a TextFile dataset
  240. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  241. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  242. EXPECT_NE(ds, nullptr);
  243. // Create Skip operation on ds
  244. ds = ds->Skip(4);
  245. EXPECT_NE(ds, nullptr);
  246. // Create Take operation on ds
  247. ds = ds->Take(1);
  248. EXPECT_NE(ds, nullptr);
  249. // Create a vocab from vector
  250. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  251. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  252. EXPECT_EQ(s, Status::OK());
  253. // Create BertTokenizer operation on ds
  254. std::shared_ptr<TensorTransform> bert_tokenizer =
  255. std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", true);
  256. EXPECT_NE(bert_tokenizer, nullptr);
  257. // Create Map operation on ds
  258. ds = ds->Map({bert_tokenizer}, {"text"});
  259. EXPECT_NE(ds, nullptr);
  260. // Create an iterator over the result of the above dataset
  261. // This will trigger the creation of the Execution Tree and launch it.
  262. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  263. EXPECT_NE(iter, nullptr);
  264. // Iterate the dataset and get each row
  265. std::unordered_map<std::string, mindspore::MSTensor> row;
  266. ASSERT_OK(iter->GetNextRow(&row));
  267. std::vector<std::string> expected = {"i", "am", "mak", "##ing", "small", "mistake",
  268. "##s", "during", "work", "##ing", "hour", "##s"};
  269. std::shared_ptr<Tensor> de_expected_tensor;
  270. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  271. mindspore::MSTensor expected_tensor =
  272. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  273. uint64_t i = 0;
  274. while (row.size() != 0) {
  275. auto ind = row["text"];
  276. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  277. ASSERT_OK(iter->GetNextRow(&row));
  278. i++;
  279. }
  280. EXPECT_EQ(i, 1);
  281. // Manually terminate the pipeline
  282. iter->Stop();
  283. }
  284. TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess3) {
  285. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess3.";
  286. // Test BertTokenizer with normalization_form NFKC
  287. // Create a TextFile dataset
  288. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  289. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  290. EXPECT_NE(ds, nullptr);
  291. // Create Skip operation on ds
  292. ds = ds->Skip(5);
  293. EXPECT_NE(ds, nullptr);
  294. // Create Take operation on ds
  295. ds = ds->Take(2);
  296. EXPECT_NE(ds, nullptr);
  297. // Create a vocab from vector
  298. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  299. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  300. EXPECT_EQ(s, Status::OK());
  301. // Create BertTokenizer operation on ds
  302. std::shared_ptr<TensorTransform> bert_tokenizer =
  303. std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", false, false, NormalizeForm::kNfc);
  304. EXPECT_NE(bert_tokenizer, nullptr);
  305. // Create Map operation on ds
  306. ds = ds->Map({bert_tokenizer}, {"text"});
  307. EXPECT_NE(ds, nullptr);
  308. // Create an iterator over the result of the above dataset
  309. // This will trigger the creation of the Execution Tree and launch it.
  310. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  311. EXPECT_NE(iter, nullptr);
  312. // Iterate the dataset and get each row
  313. std::unordered_map<std::string, mindspore::MSTensor> row;
  314. ASSERT_OK(iter->GetNextRow(&row));
  315. std::vector<std::vector<std::string>> expected = {
  316. {"😀", "嘿", "嘿", "😃", "哈", "哈", "😄", "大", "笑", "😁", "嘻", "嘻"}, {"繁", "體", "字"}};
  317. uint64_t i = 0;
  318. while (row.size() != 0) {
  319. auto ind = row["text"];
  320. std::shared_ptr<Tensor> de_expected_tensor;
  321. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  322. mindspore::MSTensor expected_tensor =
  323. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  324. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  325. ASSERT_OK(iter->GetNextRow(&row));
  326. i++;
  327. }
  328. EXPECT_EQ(i, 2);
  329. // Manually terminate the pipeline
  330. iter->Stop();
  331. }
  332. TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess4) {
  333. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess4.";
  334. // Test BertTokenizer with keep_whitespace true
  335. // Create a TextFile dataset
  336. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  337. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  338. EXPECT_NE(ds, nullptr);
  339. // Create Skip operation on ds
  340. ds = ds->Skip(7);
  341. EXPECT_NE(ds, nullptr);
  342. // Create Take operation on ds
  343. ds = ds->Take(1);
  344. EXPECT_NE(ds, nullptr);
  345. // Create a vocab from vector
  346. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  347. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  348. EXPECT_EQ(s, Status::OK());
  349. // Create BertTokenizer operation on ds
  350. std::shared_ptr<TensorTransform> bert_tokenizer =
  351. std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", false, true);
  352. EXPECT_NE(bert_tokenizer, nullptr);
  353. // Create Map operation on ds
  354. ds = ds->Map({bert_tokenizer}, {"text"});
  355. EXPECT_NE(ds, nullptr);
  356. // Create an iterator over the result of the above dataset
  357. // This will trigger the creation of the Execution Tree and launch it.
  358. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  359. EXPECT_NE(iter, nullptr);
  360. // Iterate the dataset and get each row
  361. std::unordered_map<std::string, mindspore::MSTensor> row;
  362. ASSERT_OK(iter->GetNextRow(&row));
  363. std::vector<std::string> expected = {"[UNK]", " ", "[CLS]"};
  364. std::shared_ptr<Tensor> de_expected_tensor;
  365. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  366. mindspore::MSTensor expected_tensor =
  367. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  368. uint64_t i = 0;
  369. while (row.size() != 0) {
  370. auto ind = row["text"];
  371. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  372. ASSERT_OK(iter->GetNextRow(&row));
  373. i++;
  374. }
  375. EXPECT_EQ(i, 1);
  376. // Manually terminate the pipeline
  377. iter->Stop();
  378. }
  379. TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess5) {
  380. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess5.";
  381. // Test BertTokenizer with unknown_token empty and keep_whitespace true
  382. // Create a TextFile dataset
  383. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  384. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  385. EXPECT_NE(ds, nullptr);
  386. // Create Skip operation on ds
  387. ds = ds->Skip(7);
  388. EXPECT_NE(ds, nullptr);
  389. // Create Take operation on ds
  390. ds = ds->Take(1);
  391. EXPECT_NE(ds, nullptr);
  392. // Create a vocab from vector
  393. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  394. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  395. EXPECT_EQ(s, Status::OK());
  396. // Create BertTokenizer operation on ds
  397. std::shared_ptr<TensorTransform> bert_tokenizer =
  398. std::make_shared<text::BertTokenizer>(vocab, "##", 100, "", false, true);
  399. EXPECT_NE(bert_tokenizer, nullptr);
  400. // Create Map operation on ds
  401. ds = ds->Map({bert_tokenizer}, {"text"});
  402. EXPECT_NE(ds, nullptr);
  403. // Create an iterator over the result of the above dataset
  404. // This will trigger the creation of the Execution Tree and launch it.
  405. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  406. EXPECT_NE(iter, nullptr);
  407. // Iterate the dataset and get each row
  408. std::unordered_map<std::string, mindspore::MSTensor> row;
  409. ASSERT_OK(iter->GetNextRow(&row));
  410. std::vector<std::string> expected = {"unused", " ", "[CLS]"};
  411. std::shared_ptr<Tensor> de_expected_tensor;
  412. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  413. mindspore::MSTensor expected_tensor =
  414. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  415. uint64_t i = 0;
  416. while (row.size() != 0) {
  417. auto ind = row["text"];
  418. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  419. ASSERT_OK(iter->GetNextRow(&row));
  420. i++;
  421. }
  422. EXPECT_EQ(i, 1);
  423. // Manually terminate the pipeline
  424. iter->Stop();
  425. }
  426. TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess6) {
  427. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess6.";
  428. // Test BertTokenizer with preserve_unused_token false, unknown_token empty and keep_whitespace true
  429. // Create a TextFile dataset
  430. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  431. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  432. EXPECT_NE(ds, nullptr);
  433. // Create Skip operation on ds
  434. ds = ds->Skip(7);
  435. EXPECT_NE(ds, nullptr);
  436. // Create Take operation on ds
  437. ds = ds->Take(1);
  438. EXPECT_NE(ds, nullptr);
  439. // Create a vocab from vector
  440. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  441. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  442. EXPECT_EQ(s, Status::OK());
  443. // Create BertTokenizer operation on ds
  444. std::shared_ptr<TensorTransform> bert_tokenizer =
  445. std::make_shared<text::BertTokenizer>(vocab, "##", 100, "", false, true, NormalizeForm::kNone, false);
  446. EXPECT_NE(bert_tokenizer, nullptr);
  447. // Create Map operation on ds
  448. ds = ds->Map({bert_tokenizer}, {"text"});
  449. EXPECT_NE(ds, nullptr);
  450. // Create an iterator over the result of the above dataset
  451. // This will trigger the creation of the Execution Tree and launch it.
  452. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  453. EXPECT_NE(iter, nullptr);
  454. // Iterate the dataset and get each row
  455. std::unordered_map<std::string, mindspore::MSTensor> row;
  456. ASSERT_OK(iter->GetNextRow(&row));
  457. std::vector<std::string> expected = {"unused", " ", "[", "CLS", "]"};
  458. std::shared_ptr<Tensor> de_expected_tensor;
  459. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  460. mindspore::MSTensor expected_tensor =
  461. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  462. uint64_t i = 0;
  463. while (row.size() != 0) {
  464. auto ind = row["text"];
  465. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  466. ASSERT_OK(iter->GetNextRow(&row));
  467. i++;
  468. }
  469. EXPECT_EQ(i, 1);
  470. // Manually terminate the pipeline
  471. iter->Stop();
  472. }
  473. TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess7) {
  474. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess7.";
  475. // Test BertTokenizer with with_offsets true and lower_case true
  476. // Create a TextFile dataset
  477. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  478. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  479. EXPECT_NE(ds, nullptr);
  480. // Create Skip operation on ds
  481. ds = ds->Skip(4);
  482. EXPECT_NE(ds, nullptr);
  483. // Create Take operation on ds
  484. ds = ds->Take(1);
  485. EXPECT_NE(ds, nullptr);
  486. // Create a vocab from vector
  487. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  488. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  489. EXPECT_EQ(s, Status::OK());
  490. // Create BertTokenizer operation on ds
  491. std::shared_ptr<TensorTransform> bert_tokenizer =
  492. std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", true, false, NormalizeForm::kNone, true, true);
  493. EXPECT_NE(bert_tokenizer, nullptr);
  494. // Create Map operation on ds
  495. ds = ds->Map({bert_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  496. EXPECT_NE(ds, nullptr);
  497. // Create an iterator over the result of the above dataset
  498. // This will trigger the creation of the Execution Tree and launch it.
  499. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  500. EXPECT_NE(iter, nullptr);
  501. // Iterate the dataset and get each row
  502. std::unordered_map<std::string, mindspore::MSTensor> row;
  503. ASSERT_OK(iter->GetNextRow(&row));
  504. std::vector<std::string> expected_tokens = {"i", "am", "mak", "##ing", "small", "mistake",
  505. "##s", "during", "work", "##ing", "hour", "##s"};
  506. std::vector<uint32_t> expected_offsets_start = {0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46};
  507. std::vector<uint32_t> expected_offsets_limit = {1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47};
  508. std::shared_ptr<Tensor> de_expected_tokens;
  509. ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens));
  510. mindspore::MSTensor ms_expected_tokens =
  511. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  512. std::shared_ptr<Tensor> de_expected_offsets_start;
  513. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start));
  514. mindspore::MSTensor ms_expected_offsets_start =
  515. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  516. std::shared_ptr<Tensor> de_expected_offsets_limit;
  517. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit));
  518. mindspore::MSTensor ms_expected_offsets_limit =
  519. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  520. uint64_t i = 0;
  521. while (row.size() != 0) {
  522. auto ind = row["token"];
  523. EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens);
  524. auto start = row["offsets_start"];
  525. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  526. auto limit = row["offsets_limit"];
  527. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  528. ASSERT_OK(iter->GetNextRow(&row));
  529. i++;
  530. }
  531. EXPECT_EQ(i, 1);
  532. // Manually terminate the pipeline
  533. iter->Stop();
  534. }
  535. TEST_F(MindDataTestPipeline, TestBertTokenizerFail1) {
  536. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail1.";
  537. // Test BertTokenizer with nullptr vocab
  538. // Create a TextFile dataset
  539. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  540. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  541. EXPECT_NE(ds, nullptr);
  542. // Create BertTokenizer operation on ds
  543. std::shared_ptr<TensorTransform> bert_tokenizer = std::make_shared<text::BertTokenizer>(nullptr);
  544. EXPECT_NE(bert_tokenizer, nullptr);
  545. // Create a Map operation on ds
  546. ds = ds->Map({bert_tokenizer});
  547. EXPECT_NE(ds, nullptr);
  548. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  549. // Expect failure: invalid BertTokenizer input with nullptr vocab
  550. EXPECT_EQ(iter, nullptr);
  551. }
  552. TEST_F(MindDataTestPipeline, TestBertTokenizerFail2) {
  553. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail2.";
  554. // Test BertTokenizer with negative max_bytes_per_token
  555. // Create a TextFile dataset
  556. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  557. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  558. EXPECT_NE(ds, nullptr);
  559. // Create a vocab from vector
  560. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  561. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  562. EXPECT_EQ(s, Status::OK());
  563. // Create BertTokenizer operation on ds
  564. std::shared_ptr<TensorTransform> bert_tokenizer = std::make_shared<text::BertTokenizer>(vocab, "##", -1);
  565. EXPECT_NE(bert_tokenizer, nullptr);
  566. // Create a Map operation on ds
  567. ds = ds->Map({bert_tokenizer});
  568. EXPECT_NE(ds, nullptr);
  569. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  570. // Expect failure: invalid BertTokenizer input with nullptr vocab
  571. EXPECT_EQ(iter, nullptr);
  572. }
  573. TEST_F(MindDataTestPipeline, TestCaseFoldSuccess) {
  574. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaseFoldSuccess.";
  575. // Create a TextFile dataset
  576. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  577. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  578. EXPECT_NE(ds, nullptr);
  579. // Create casefold operation on ds
  580. std::shared_ptr<TensorTransform> casefold = std::make_shared<text::CaseFold>();
  581. EXPECT_NE(casefold, nullptr);
  582. // Create Map operation on ds
  583. ds = ds->Map({casefold}, {"text"});
  584. EXPECT_NE(ds, nullptr);
  585. // Create an iterator over the result of the above dataset
  586. // This will trigger the creation of the Execution Tree and launch it.
  587. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  588. EXPECT_NE(iter, nullptr);
  589. // Iterate the dataset and get each row
  590. std::unordered_map<std::string, mindspore::MSTensor> row;
  591. ASSERT_OK(iter->GetNextRow(&row));
  592. std::vector<std::string> expected = {"welcome to beijing!", "北京欢迎您!", "我喜欢english!", " "};
  593. uint64_t i = 0;
  594. while (row.size() != 0) {
  595. auto ind = row["text"];
  596. std::shared_ptr<Tensor> de_expected_tensor;
  597. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  598. mindspore::MSTensor ms_expected_tensor =
  599. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  600. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  601. ASSERT_OK(iter->GetNextRow(&row));
  602. i++;
  603. }
  604. EXPECT_EQ(i, 4);
  605. // Manually terminate the pipeline
  606. iter->Stop();
  607. }
  608. TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess) {
  609. // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is false.
  610. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess.";
  611. // Create a TextFile dataset
  612. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  613. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  614. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  615. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  616. EXPECT_NE(ds, nullptr);
  617. // Create jieba_tokenizer operation on ds
  618. std::shared_ptr<TensorTransform> jieba_tokenizer =
  619. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  620. EXPECT_NE(jieba_tokenizer, nullptr);
  621. // Create Map operation on ds
  622. ds = ds->Map({jieba_tokenizer}, {"text"});
  623. EXPECT_NE(ds, nullptr);
  624. // Create an iterator over the result of the above dataset
  625. // This will trigger the creation of the Execution Tree and launch it.
  626. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  627. EXPECT_NE(iter, nullptr);
  628. // Iterate the dataset and get each row
  629. std::unordered_map<std::string, mindspore::MSTensor> row;
  630. ASSERT_OK(iter->GetNextRow(&row));
  631. std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
  632. std::shared_ptr<Tensor> de_expected_tensor;
  633. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  634. mindspore::MSTensor expected_tensor =
  635. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  636. uint64_t i = 0;
  637. while (row.size() != 0) {
  638. auto ind = row["text"];
  639. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  640. ASSERT_OK(iter->GetNextRow(&row));
  641. i++;
  642. }
  643. EXPECT_EQ(i, 1);
  644. // Manually terminate the pipeline
  645. iter->Stop();
  646. }
  647. TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess1) {
  648. // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kHmm and the with_offsets is false.
  649. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess1.";
  650. // Create a TextFile dataset
  651. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  652. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  653. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  654. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  655. EXPECT_NE(ds, nullptr);
  656. // Create jieba_tokenizer operation on ds
  657. std::shared_ptr<TensorTransform> jieba_tokenizer =
  658. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kHmm);
  659. EXPECT_NE(jieba_tokenizer, nullptr);
  660. // Create Map operation on ds
  661. ds = ds->Map({jieba_tokenizer}, {"text"});
  662. EXPECT_NE(ds, nullptr);
  663. // Create an iterator over the result of the above dataset
  664. // This will trigger the creation of the Execution Tree and launch it.
  665. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  666. EXPECT_NE(iter, nullptr);
  667. // Iterate the dataset and get each row
  668. std::unordered_map<std::string, mindspore::MSTensor> row;
  669. ASSERT_OK(iter->GetNextRow(&row));
  670. std::vector<std::string> expected = {"今天", "天气", "太", "好", "了", "我们", "一起", "去", "外面", "玩", "吧"};
  671. std::shared_ptr<Tensor> de_expected_tensor;
  672. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  673. mindspore::MSTensor expected_tensor =
  674. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  675. uint64_t i = 0;
  676. while (row.size() != 0) {
  677. auto ind = row["text"];
  678. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  679. ASSERT_OK(iter->GetNextRow(&row));
  680. i++;
  681. }
  682. EXPECT_EQ(i, 1);
  683. // Manually terminate the pipeline
  684. iter->Stop();
  685. }
  686. TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess2) {
  687. // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is true.
  688. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess2.";
  689. // Create a TextFile dataset
  690. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  691. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  692. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  693. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  694. EXPECT_NE(ds, nullptr);
  695. // Create jieba_tokenizer operation on ds
  696. std::shared_ptr<TensorTransform> jieba_tokenizer =
  697. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp, true);
  698. EXPECT_NE(jieba_tokenizer, nullptr);
  699. // Create Map operation on ds
  700. ds = ds->Map({jieba_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
  701. {"token", "offsets_start", "offsets_limit"});
  702. EXPECT_NE(ds, nullptr);
  703. // Create an iterator over the result of the above dataset
  704. // This will trigger the creation of the Execution Tree and launch it.
  705. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  706. EXPECT_NE(iter, nullptr);
  707. // Iterate the dataset and get each row
  708. std::unordered_map<std::string, mindspore::MSTensor> row;
  709. ASSERT_OK(iter->GetNextRow(&row));
  710. std::vector<std::string> expected_tokens = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
  711. std::vector<uint32_t> expected_offsets_start = {0, 12, 21, 27, 33, 36, 42};
  712. std::vector<uint32_t> expected_offsets_limit = {12, 21, 27, 33, 36, 42, 48};
  713. std::shared_ptr<Tensor> de_expected_tokens;
  714. ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens));
  715. mindspore::MSTensor ms_expected_tokens =
  716. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  717. std::shared_ptr<Tensor> de_expected_offsets_start;
  718. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start));
  719. mindspore::MSTensor ms_expected_offsets_start =
  720. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  721. std::shared_ptr<Tensor> de_expected_offsets_limit;
  722. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit));
  723. mindspore::MSTensor ms_expected_offsets_limit =
  724. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  725. uint64_t i = 0;
  726. while (row.size() != 0) {
  727. auto ind = row["token"];
  728. EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens);
  729. auto start = row["offsets_start"];
  730. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  731. auto limit = row["offsets_limit"];
  732. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  733. ASSERT_OK(iter->GetNextRow(&row));
  734. i++;
  735. }
  736. EXPECT_EQ(i, 1);
  737. // Manually terminate the pipeline
  738. iter->Stop();
  739. }
  740. TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail1) {
  741. // Testing the incorrect parameter of JiebaTokenizer interface.
  742. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail1.";
  743. // Create a TextFile dataset
  744. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  745. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  746. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  747. EXPECT_NE(ds, nullptr);
  748. // Create jieba_tokenizer operation on ds
  749. // Testing the parameter hmm_path is empty
  750. std::shared_ptr<TensorTransform> jieba_tokenizer =
  751. std::make_shared<text::JiebaTokenizer>("", mp_path, JiebaMode::kMp);
  752. EXPECT_NE(jieba_tokenizer, nullptr);
  753. // Create a Map operation on ds
  754. ds = ds->Map({jieba_tokenizer});
  755. EXPECT_NE(ds, nullptr);
  756. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  757. // Expect failure: invalid JiebaTokenizer input (parameter hmm_path is empty)
  758. EXPECT_EQ(iter, nullptr);
  759. }
  760. TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail2) {
  761. // Testing the incorrect parameter of JiebaTokenizer interface.
  762. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail2.";
  763. // Create a TextFile dataset
  764. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  765. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  766. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  767. EXPECT_NE(ds, nullptr);
  768. // Create jieba_tokenizer operation on ds
  769. // Testing the parameter mp_path is empty
  770. std::shared_ptr<TensorTransform> jieba_tokenizer =
  771. std::make_shared<text::JiebaTokenizer>(hmm_path, "", JiebaMode::kMp);
  772. EXPECT_NE(jieba_tokenizer, nullptr);
  773. // Create a Map operation on ds
  774. ds = ds->Map({jieba_tokenizer});
  775. EXPECT_NE(ds, nullptr);
  776. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  777. // Expect failure: invalid JiebaTokenizer input (parameter mp_path is empty)
  778. EXPECT_EQ(iter, nullptr);
  779. }
  780. TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail3) {
  781. // Testing the incorrect parameter of JiebaTokenizer interface.
  782. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail3.";
  783. // Create a TextFile dataset
  784. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  785. std::string hmm_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
  786. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  787. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  788. EXPECT_NE(ds, nullptr);
  789. // Create jieba_tokenizer operation on ds
  790. // Testing the parameter hmm_path is invalid path
  791. std::shared_ptr<TensorTransform> jieba_tokenizer =
  792. std::make_shared<text::JiebaTokenizer>(hmm_path_invalid, mp_path, JiebaMode::kMp);
  793. EXPECT_NE(jieba_tokenizer, nullptr);
  794. // Create a Map operation on ds
  795. ds = ds->Map({jieba_tokenizer});
  796. EXPECT_NE(ds, nullptr);
  797. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  798. // Expect failure: invalid JiebaTokenizer input (parameter hmm_path is invalid path)
  799. EXPECT_EQ(iter, nullptr);
  800. }
  801. TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail4) {
  802. // Testing the incorrect parameter of JiebaTokenizer interface.
  803. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail4.";
  804. // Create a TextFile dataset
  805. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  806. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  807. std::string mp_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
  808. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  809. EXPECT_NE(ds, nullptr);
  810. // Create jieba_tokenizer operation on ds
  811. // Testing the parameter mp_path is invalid path
  812. std::shared_ptr<TensorTransform> jieba_tokenizer =
  813. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path_invalid, JiebaMode::kMp);
  814. EXPECT_NE(jieba_tokenizer, nullptr);
  815. // Create a Map operation on ds
  816. ds = ds->Map({jieba_tokenizer});
  817. EXPECT_NE(ds, nullptr);
  818. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  819. // Expect failure: invalid JiebaTokenizer input (parameter mp_path is invalid path)
  820. EXPECT_EQ(iter, nullptr);
  821. }
  822. TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) {
  823. // Testing the parameter AddWord of JiebaTokenizer when the freq is not provided (default 0).
  824. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord.";
  825. // Create a TextFile dataset
  826. std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
  827. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  828. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  829. std::shared_ptr<Dataset> ds = TextFile({data_file});
  830. EXPECT_NE(ds, nullptr);
  831. // Create jieba_tokenizer operation on ds
  832. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
  833. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  834. EXPECT_NE(jieba_tokenizer, nullptr);
  835. // Add word with freq not provided (default 0)
  836. ASSERT_OK(jieba_tokenizer->AddWord("男默女泪"));
  837. // Create Map operation on ds
  838. ds = ds->Map({jieba_tokenizer}, {"text"});
  839. EXPECT_NE(ds, nullptr);
  840. // Create an iterator over the result of the above dataset
  841. // This will trigger the creation of the Execution Tree and launch it.
  842. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  843. EXPECT_NE(iter, nullptr);
  844. // Iterate the dataset and get each row
  845. std::unordered_map<std::string, mindspore::MSTensor> row;
  846. ASSERT_OK(iter->GetNextRow(&row));
  847. std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
  848. std::shared_ptr<Tensor> de_expected_tensor;
  849. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  850. mindspore::MSTensor expected_tensor =
  851. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  852. uint64_t i = 0;
  853. while (row.size() != 0) {
  854. auto ind = row["text"];
  855. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  856. ASSERT_OK(iter->GetNextRow(&row));
  857. i++;
  858. }
  859. EXPECT_EQ(i, 1);
  860. // Manually terminate the pipeline
  861. iter->Stop();
  862. }
  863. TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) {
  864. // Testing the parameter AddWord of JiebaTokenizer when the freq is set explicitly to 0.
  865. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord1.";
  866. // Create a TextFile dataset
  867. std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
  868. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  869. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  870. std::shared_ptr<Dataset> ds = TextFile({data_file});
  871. EXPECT_NE(ds, nullptr);
  872. // Create jieba_tokenizer operation on ds
  873. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
  874. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  875. EXPECT_NE(jieba_tokenizer, nullptr);
  876. // Add word with freq is set explicitly to 0
  877. ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 0));
  878. // Create Map operation on ds
  879. ds = ds->Map({jieba_tokenizer}, {"text"});
  880. EXPECT_NE(ds, nullptr);
  881. // Create an iterator over the result of the above dataset
  882. // This will trigger the creation of the Execution Tree and launch it.
  883. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  884. EXPECT_NE(iter, nullptr);
  885. // Iterate the dataset and get each row
  886. std::unordered_map<std::string, mindspore::MSTensor> row;
  887. ASSERT_OK(iter->GetNextRow(&row));
  888. std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
  889. std::shared_ptr<Tensor> de_expected_tensor;
  890. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  891. mindspore::MSTensor expected_tensor =
  892. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  893. uint64_t i = 0;
  894. while (row.size() != 0) {
  895. auto ind = row["text"];
  896. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  897. ASSERT_OK(iter->GetNextRow(&row));
  898. i++;
  899. }
  900. EXPECT_EQ(i, 1);
  901. // Manually terminate the pipeline
  902. iter->Stop();
  903. }
  904. TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) {
  905. // Testing the parameter AddWord of JiebaTokenizer when the freq is 10.
  906. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord2.";
  907. // Create a TextFile dataset
  908. std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
  909. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  910. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  911. std::shared_ptr<Dataset> ds = TextFile({data_file});
  912. EXPECT_NE(ds, nullptr);
  913. // Create jieba_tokenizer operation on ds
  914. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
  915. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  916. EXPECT_NE(jieba_tokenizer, nullptr);
  917. // Add word with freq 10
  918. ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 10));
  919. // Create Map operation on ds
  920. ds = ds->Map({jieba_tokenizer}, {"text"});
  921. EXPECT_NE(ds, nullptr);
  922. // Create an iterator over the result of the above dataset
  923. // This will trigger the creation of the Execution Tree and launch it.
  924. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  925. EXPECT_NE(iter, nullptr);
  926. // Iterate the dataset and get each row
  927. std::unordered_map<std::string, mindspore::MSTensor> row;
  928. ASSERT_OK(iter->GetNextRow(&row));
  929. std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
  930. std::shared_ptr<Tensor> de_expected_tensor;
  931. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  932. mindspore::MSTensor expected_tensor =
  933. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  934. uint64_t i = 0;
  935. while (row.size() != 0) {
  936. auto ind = row["text"];
  937. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  938. ASSERT_OK(iter->GetNextRow(&row));
  939. i++;
  940. }
  941. EXPECT_EQ(i, 1);
  942. // Manually terminate the pipeline
  943. iter->Stop();
  944. }
  945. TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) {
  946. // Testing the parameter AddWord of JiebaTokenizer when the freq is 20000 which affects the result of segmentation.
  947. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord3.";
  948. // Create a TextFile dataset
  949. std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
  950. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  951. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  952. std::shared_ptr<Dataset> ds = TextFile({data_file});
  953. EXPECT_NE(ds, nullptr);
  954. // Create jieba_tokenizer operation on ds
  955. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
  956. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  957. EXPECT_NE(jieba_tokenizer, nullptr);
  958. // Add word with freq 20000
  959. ASSERT_OK(jieba_tokenizer->AddWord("江大桥", 20000));
  960. // Create Map operation on ds
  961. ds = ds->Map({jieba_tokenizer}, {"text"});
  962. EXPECT_NE(ds, nullptr);
  963. // Create an iterator over the result of the above dataset
  964. // This will trigger the creation of the Execution Tree and launch it.
  965. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  966. EXPECT_NE(iter, nullptr);
  967. // Iterate the dataset and get each row
  968. std::unordered_map<std::string, mindspore::MSTensor> row;
  969. ASSERT_OK(iter->GetNextRow(&row));
  970. std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"};
  971. std::shared_ptr<Tensor> de_expected_tensor;
  972. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  973. mindspore::MSTensor expected_tensor =
  974. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  975. uint64_t i = 0;
  976. while (row.size() != 0) {
  977. auto ind = row["text"];
  978. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  979. ASSERT_OK(iter->GetNextRow(&row));
  980. i++;
  981. }
  982. EXPECT_EQ(i, 1);
  983. // Manually terminate the pipeline
  984. iter->Stop();
  985. }
  986. TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) {
  987. // Testing the incorrect parameter of AddWord in JiebaTokenizer.
  988. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWordFail.";
  989. // Create a TextFile dataset
  990. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  991. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  992. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  993. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  994. EXPECT_NE(ds, nullptr);
  995. // Testing the parameter word of AddWord is empty
  996. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
  997. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  998. EXPECT_NE(jieba_tokenizer, nullptr);
  999. EXPECT_NE(jieba_tokenizer->AddWord("", 10), Status::OK());
  1000. // Testing the parameter freq of AddWord is negative
  1001. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer1 =
  1002. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  1003. EXPECT_NE(jieba_tokenizer1, nullptr);
  1004. EXPECT_NE(jieba_tokenizer1->AddWord("我们", -1), Status::OK());
  1005. }
  1006. TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDict) {
  1007. // Testing AddDict of JiebaTokenizer when the input is a vector of word-freq pair.
  1008. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDict.";
  1009. // Create a TextFile dataset
  1010. std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
  1011. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  1012. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  1013. std::shared_ptr<Dataset> ds = TextFile({data_file});
  1014. EXPECT_NE(ds, nullptr);
  1015. // Create jieba_tokenizer operation on ds
  1016. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
  1017. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  1018. EXPECT_NE(jieba_tokenizer, nullptr);
  1019. // Add word with freq 20000
  1020. std::vector<std::pair<std::string, int64_t>> user_dict = {{"江大桥", 20000}};
  1021. ASSERT_OK(jieba_tokenizer->AddDict(user_dict));
  1022. // Create Map operation on ds
  1023. ds = ds->Map({jieba_tokenizer}, {"text"});
  1024. EXPECT_NE(ds, nullptr);
  1025. // Create an iterator over the result of the above dataset
  1026. // This will trigger the creation of the Execution Tree and launch it.
  1027. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1028. EXPECT_NE(iter, nullptr);
  1029. // Iterate the dataset and get each row
  1030. std::unordered_map<std::string, mindspore::MSTensor> row;
  1031. ASSERT_OK(iter->GetNextRow(&row));
  1032. std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"};
  1033. std::shared_ptr<Tensor> de_expected_tensor;
  1034. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  1035. mindspore::MSTensor expected_tensor =
  1036. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1037. uint64_t i = 0;
  1038. while (row.size() != 0) {
  1039. auto txt = row["text"];
  1040. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  1041. ASSERT_OK(iter->GetNextRow(&row));
  1042. i++;
  1043. }
  1044. EXPECT_EQ(i, 1);
  1045. // Manually terminate the pipeline
  1046. iter->Stop();
  1047. }
  1048. TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDictFromFile) {
  1049. // Testing AddDict of JiebaTokenizer when the input is a path to dict.
  1050. // Test error scenario for AddDict: invalid path
  1051. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDictFromFile.";
  1052. // Create a TextFile dataset
  1053. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  1054. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  1055. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  1056. std::shared_ptr<Dataset> ds = TextFile({data_file});
  1057. EXPECT_NE(ds, nullptr);
  1058. // Create jieba_tokenizer operation on ds
  1059. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
  1060. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  1061. EXPECT_NE(jieba_tokenizer, nullptr);
  1062. // Load dict from txt file
  1063. std::string user_dict_path = datasets_root_path_ + "/testJiebaDataset/user_dict.txt";
  1064. std::string invalid_path = datasets_root_path_ + "/testJiebaDataset/invalid_path.txt";
  1065. EXPECT_ERROR(jieba_tokenizer->AddDict(invalid_path));
  1066. ASSERT_OK(jieba_tokenizer->AddDict(user_dict_path));
  1067. // Create Map operation on ds
  1068. ds = ds->Map({jieba_tokenizer}, {"text"});
  1069. EXPECT_NE(ds, nullptr);
  1070. // Create an iterator over the result of the above dataset
  1071. // This will trigger the creation of the Execution Tree and launch it.
  1072. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1073. EXPECT_NE(iter, nullptr);
  1074. // Iterate the dataset and get each row
  1075. std::unordered_map<std::string, mindspore::MSTensor> row;
  1076. ASSERT_OK(iter->GetNextRow(&row));
  1077. std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
  1078. std::shared_ptr<Tensor> de_expected_tensor;
  1079. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  1080. mindspore::MSTensor expected_tensor =
  1081. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1082. uint64_t i = 0;
  1083. while (row.size() != 0) {
  1084. auto txt = row["text"];
  1085. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  1086. ASSERT_OK(iter->GetNextRow(&row));
  1087. i++;
  1088. }
  1089. EXPECT_EQ(i, 1);
  1090. // Manually terminate the pipeline
  1091. iter->Stop();
  1092. }
  1093. TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) {
  1094. // Testing the parameter of SlidingWindow interface when the axis is 0.
  1095. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess.";
  1096. // Create a TextFile dataset
  1097. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1098. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1099. EXPECT_NE(ds, nullptr);
  1100. // Create white_tokenizer operation on ds
  1101. std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  1102. EXPECT_NE(white_tokenizer, nullptr);
  1103. // Create sliding_window operation on ds
  1104. std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(3, 0);
  1105. EXPECT_NE(sliding_window, nullptr);
  1106. // Create Map operation on ds
  1107. ds = ds->Map({white_tokenizer, sliding_window}, {"text"});
  1108. EXPECT_NE(ds, nullptr);
  1109. // Create an iterator over the result of the above dataset
  1110. // This will trigger the creation of the Execution Tree and launch it.
  1111. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1112. EXPECT_NE(iter, nullptr);
  1113. // Iterate the dataset and get each row
  1114. std::unordered_map<std::string, mindspore::MSTensor> row;
  1115. ASSERT_OK(iter->GetNextRow(&row));
  1116. std::vector<std::vector<std::string>> expected = {{"This", "is", "a", "is", "a", "text", "a", "text", "file."},
  1117. {"Be", "happy", "every", "happy", "every", "day."},
  1118. {"Good", "luck", "to", "luck", "to", "everyone."}};
  1119. uint64_t i = 0;
  1120. while (row.size() != 0) {
  1121. auto ind = row["text"];
  1122. std::shared_ptr<Tensor> de_expected_tensor;
  1123. int x = expected[i].size() / 3;
  1124. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x, 3}), &de_expected_tensor));
  1125. mindspore::MSTensor expected_tensor =
  1126. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1127. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  1128. ASSERT_OK(iter->GetNextRow(&row));
  1129. i++;
  1130. }
  1131. EXPECT_EQ(i, 3);
  1132. // Manually terminate the pipeline
  1133. iter->Stop();
  1134. }
  1135. TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess1) {
  1136. // Testing the parameter of SlidingWindow interface when the axis is -1.
  1137. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess1.";
  1138. // Create a TextFile dataset
  1139. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1140. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1141. EXPECT_NE(ds, nullptr);
  1142. // Create white_tokenizer operation on ds
  1143. std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  1144. EXPECT_NE(white_tokenizer, nullptr);
  1145. // Create sliding_window operation on ds
  1146. std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(2, -1);
  1147. EXPECT_NE(sliding_window, nullptr);
  1148. // Create Map operation on ds
  1149. ds = ds->Map({white_tokenizer, sliding_window}, {"text"});
  1150. EXPECT_NE(ds, nullptr);
  1151. // Create an iterator over the result of the above dataset
  1152. // This will trigger the creation of the Execution Tree and launch it.
  1153. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1154. EXPECT_NE(iter, nullptr);
  1155. // Iterate the dataset and get each row
  1156. std::unordered_map<std::string, mindspore::MSTensor> row;
  1157. ASSERT_OK(iter->GetNextRow(&row));
  1158. std::vector<std::vector<std::string>> expected = {{"This", "is", "is", "a", "a", "text", "text", "file."},
  1159. {"Be", "happy", "happy", "every", "every", "day."},
  1160. {"Good", "luck", "luck", "to", "to", "everyone."}};
  1161. uint64_t i = 0;
  1162. while (row.size() != 0) {
  1163. auto ind = row["text"];
  1164. std::shared_ptr<Tensor> de_expected_tensor;
  1165. int x = expected[i].size() / 2;
  1166. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x, 2}), &de_expected_tensor));
  1167. mindspore::MSTensor expected_tensor =
  1168. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1169. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  1170. ASSERT_OK(iter->GetNextRow(&row));
  1171. i++;
  1172. }
  1173. EXPECT_EQ(i, 3);
  1174. // Manually terminate the pipeline
  1175. iter->Stop();
  1176. }
  1177. TEST_F(MindDataTestPipeline, TestSlidingWindowFail1) {
  1178. // Testing the incorrect parameter of SlidingWindow interface.
  1179. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail1.";
  1180. // Create a TextFile dataset
  1181. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1182. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1183. EXPECT_NE(ds, nullptr);
  1184. // Create sliding_window operation on ds
  1185. // Testing the parameter width less than or equal to 0
  1186. // The parameter axis support 0 or -1 only for now
  1187. std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(0, 0);
  1188. EXPECT_NE(sliding_window, nullptr);
  1189. // Create a Map operation on ds
  1190. ds = ds->Map({sliding_window});
  1191. EXPECT_NE(ds, nullptr);
  1192. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1193. // Expect failure: invalid SlidingWindow input (width less than or equal to 0)
  1194. EXPECT_EQ(iter, nullptr);
  1195. }
  1196. TEST_F(MindDataTestPipeline, TestSlidingWindowFail2) {
  1197. // Testing the incorrect parameter of SlidingWindow interface.
  1198. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail2.";
  1199. // Create a TextFile dataset
  1200. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1201. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1202. EXPECT_NE(ds, nullptr);
  1203. // Create sliding_window operation on ds
  1204. // Testing the parameter width less than or equal to 0
  1205. // The parameter axis support 0 or -1 only for now
  1206. std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(-2, 0);
  1207. EXPECT_NE(sliding_window, nullptr);
  1208. // Create a Map operation on ds
  1209. ds = ds->Map({sliding_window});
  1210. EXPECT_NE(ds, nullptr);
  1211. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1212. // Expect failure: invalid SlidingWindow input (width less than or equal to 0)
  1213. EXPECT_EQ(iter, nullptr);
  1214. }
  1215. TEST_F(MindDataTestPipeline, TestToNumberSuccess1) {
  1216. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberSuccess1.";
  1217. // Test ToNumber with integer numbers
  1218. std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
  1219. // Create a TextFile dataset
  1220. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1221. EXPECT_NE(ds, nullptr);
  1222. // Create a Take operation on ds
  1223. ds = ds->Take(8);
  1224. EXPECT_NE(ds, nullptr);
  1225. // Create ToNumber operation on ds
  1226. std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
  1227. EXPECT_NE(to_number, nullptr);
  1228. // Create a Map operation on ds
  1229. ds = ds->Map({to_number}, {"text"});
  1230. EXPECT_NE(ds, nullptr);
  1231. // Create an iterator over the result of the above dataset
  1232. // This will trigger the creation of the Execution Tree and launch it.
  1233. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1234. EXPECT_NE(iter, nullptr);
  1235. // Iterate the dataset and get each row
  1236. std::unordered_map<std::string, mindspore::MSTensor> row;
  1237. ASSERT_OK(iter->GetNextRow(&row));
  1238. std::vector<int64_t> expected = {-121, 14, -2219, 7623, -8162536, 162371864, -1726483716, 98921728421};
  1239. uint64_t i = 0;
  1240. while (row.size() != 0) {
  1241. auto ind = row["text"];
  1242. std::shared_ptr<Tensor> de_expected_tensor;
  1243. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  1244. mindspore::MSTensor ms_expected_tensor =
  1245. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1246. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  1247. ASSERT_OK(iter->GetNextRow(&row));
  1248. i++;
  1249. }
  1250. EXPECT_EQ(i, 8);
  1251. // Manually terminate the pipeline
  1252. iter->Stop();
  1253. }
  1254. TEST_F(MindDataTestPipeline, TestToNumberSuccess2) {
  1255. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberSuccess2.";
  1256. // Test ToNumber with float numbers
  1257. std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
  1258. // Create a TextFile dataset
  1259. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1260. EXPECT_NE(ds, nullptr);
  1261. // Create a Skip operation on ds
  1262. ds = ds->Skip(8);
  1263. EXPECT_NE(ds, nullptr);
  1264. // Create a Take operation on ds
  1265. ds = ds->Take(6);
  1266. EXPECT_NE(ds, nullptr);
  1267. // Create ToNumber operation on ds
  1268. std::shared_ptr<TensorTransform> to_number =
  1269. std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
  1270. EXPECT_NE(to_number, nullptr);
  1271. // Create a Map operation on ds
  1272. ds = ds->Map({to_number}, {"text"});
  1273. EXPECT_NE(ds, nullptr);
  1274. // Create an iterator over the result of the above dataset
  1275. // This will trigger the creation of the Execution Tree and launch it.
  1276. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1277. EXPECT_NE(iter, nullptr);
  1278. // Iterate the dataset and get each row
  1279. std::unordered_map<std::string, mindspore::MSTensor> row;
  1280. ASSERT_OK(iter->GetNextRow(&row));
  1281. std::vector<double_t> expected = {-1.1, 1.4, -2219.321, 7623.453, -816256.234282, 162371864.243243};
  1282. uint64_t i = 0;
  1283. while (row.size() != 0) {
  1284. auto ind = row["text"];
  1285. std::shared_ptr<Tensor> de_expected_tensor;
  1286. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  1287. mindspore::MSTensor ms_expected_tensor =
  1288. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1289. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  1290. ASSERT_OK(iter->GetNextRow(&row));
  1291. i++;
  1292. }
  1293. EXPECT_EQ(i, 6);
  1294. // Manually terminate the pipeline
  1295. iter->Stop();
  1296. }
  1297. TEST_F(MindDataTestPipeline, TestToNumberFail1) {
  1298. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail1.";
  1299. // Test ToNumber with overflow integer numbers
  1300. std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
  1301. // Create a TextFile dataset
  1302. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1303. EXPECT_NE(ds, nullptr);
  1304. // Create a Skip operation on ds
  1305. ds = ds->Skip(2);
  1306. EXPECT_NE(ds, nullptr);
  1307. // Create a Take operation on ds
  1308. ds = ds->Take(6);
  1309. EXPECT_NE(ds, nullptr);
  1310. // Create ToNumber operation on ds
  1311. std::shared_ptr<TensorTransform> to_number =
  1312. std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
  1313. EXPECT_NE(to_number, nullptr);
  1314. // Create a Map operation on ds
  1315. ds = ds->Map({to_number}, {"text"});
  1316. EXPECT_NE(ds, nullptr);
  1317. // Create an iterator over the result of the above dataset
  1318. // This will trigger the creation of the Execution Tree and launch it.
  1319. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1320. EXPECT_NE(iter, nullptr);
  1321. // Iterate the dataset and get each row
  1322. std::unordered_map<std::string, mindspore::MSTensor> row;
  1323. // Expect error: input out of bounds of int8
  1324. EXPECT_ERROR(iter->GetNextRow(&row));
  1325. uint64_t i = 0;
  1326. while (row.size() != 0) {
  1327. EXPECT_ERROR(iter->GetNextRow(&row));
  1328. i++;
  1329. }
  1330. // Expect failure: GetNextRow fail and return nothing
  1331. EXPECT_EQ(i, 0);
  1332. // Manually terminate the pipeline
  1333. iter->Stop();
  1334. }
  1335. TEST_F(MindDataTestPipeline, TestToNumberFail2) {
  1336. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail2.";
  1337. // Test ToNumber with overflow float numbers
  1338. std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
  1339. // Create a TextFile dataset
  1340. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1341. EXPECT_NE(ds, nullptr);
  1342. // Create a Skip operation on ds
  1343. ds = ds->Skip(12);
  1344. EXPECT_NE(ds, nullptr);
  1345. // Create a Take operation on ds
  1346. ds = ds->Take(2);
  1347. EXPECT_NE(ds, nullptr);
  1348. // Create ToNumber operation on ds
  1349. std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
  1350. EXPECT_NE(to_number, nullptr);
  1351. // Create a Map operation on ds
  1352. ds = ds->Map({to_number}, {"text"});
  1353. EXPECT_NE(ds, nullptr);
  1354. // Create an iterator over the result of the above dataset
  1355. // This will trigger the creation of the Execution Tree and launch it.
  1356. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1357. EXPECT_NE(iter, nullptr);
  1358. // Iterate the dataset and get each row
  1359. std::unordered_map<std::string, mindspore::MSTensor> row;
  1360. // Expect error: input out of bounds of float16
  1361. EXPECT_ERROR(iter->GetNextRow(&row));
  1362. uint64_t i = 0;
  1363. while (row.size() != 0) {
  1364. EXPECT_ERROR(iter->GetNextRow(&row));
  1365. i++;
  1366. }
  1367. // Expect failure: GetNextRow fail and return nothing
  1368. EXPECT_EQ(i, 0);
  1369. // Manually terminate the pipeline
  1370. iter->Stop();
  1371. }
  1372. TEST_F(MindDataTestPipeline, TestToNumberFail3) {
  1373. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail3.";
  1374. // Test ToNumber with non numerical input
  1375. std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
  1376. // Create a TextFile dataset
  1377. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1378. EXPECT_NE(ds, nullptr);
  1379. // Create a Skip operation on ds
  1380. ds = ds->Skip(14);
  1381. EXPECT_NE(ds, nullptr);
  1382. // Create ToNumber operation on ds
  1383. std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
  1384. EXPECT_NE(to_number, nullptr);
  1385. // Create a Map operation on ds
  1386. ds = ds->Map({to_number}, {"text"});
  1387. EXPECT_NE(ds, nullptr);
  1388. // Create an iterator over the result of the above dataset
  1389. // This will trigger the creation of the Execution Tree and launch it.
  1390. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1391. EXPECT_NE(iter, nullptr);
  1392. // Iterate the dataset and get each row
  1393. std::unordered_map<std::string, mindspore::MSTensor> row;
  1394. // Expect error: invalid input which is non numerical
  1395. EXPECT_ERROR(iter->GetNextRow(&row));
  1396. uint64_t i = 0;
  1397. while (row.size() != 0) {
  1398. EXPECT_ERROR(iter->GetNextRow(&row));
  1399. i++;
  1400. }
  1401. // Expect failure: GetNextRow fail and return nothing
  1402. EXPECT_EQ(i, 0);
  1403. // Manually terminate the pipeline
  1404. iter->Stop();
  1405. }
  1406. TEST_F(MindDataTestPipeline, TestToNumberFail4) {
  1407. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail4.";
  1408. // Test ToNumber with non numerical data type
  1409. std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
  1410. // Create a TextFile dataset
  1411. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1412. EXPECT_NE(ds, nullptr);
  1413. // Create ToNumber operation on ds
  1414. std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kObjectTypeString);
  1415. EXPECT_NE(to_number, nullptr);
  1416. // Create a Map operation on ds
  1417. ds = ds->Map({to_number}, {"text"});
  1418. EXPECT_NE(ds, nullptr);
  1419. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1420. // Expect failure: invalid parameter with non numerical data type
  1421. EXPECT_EQ(iter, nullptr);
  1422. }
  1423. TEST_F(MindDataTestPipeline, TestToNumberFail5) {
  1424. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail5.";
  1425. // Test ToNumber with non numerical data type
  1426. std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
  1427. // Create a TextFile dataset
  1428. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1429. EXPECT_NE(ds, nullptr);
  1430. // Create ToNumber operation on ds
  1431. std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeBool);
  1432. EXPECT_NE(to_number, nullptr);
  1433. // Create a Map operation on ds
  1434. ds = ds->Map({to_number}, {"text"});
  1435. EXPECT_NE(ds, nullptr);
  1436. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1437. // Expect failure: invalid parameter with non numerical data type
  1438. EXPECT_EQ(iter, nullptr);
  1439. }
  1440. TEST_F(MindDataTestPipeline, TestTruncateSequencePairSuccess1) {
  1441. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairSuccess1.";
  1442. // Testing basic TruncateSequencePair
  1443. // Set seed for RandomDataset
  1444. auto original_seed = config::get_seed();
  1445. bool status_set_seed = config::set_seed(0);
  1446. EXPECT_EQ(status_set_seed, true);
  1447. // Set num_parallel_workers for RandomDataset
  1448. auto original_worker = config::get_num_parallel_workers();
  1449. bool status_set_worker = config::set_num_parallel_workers(1);
  1450. EXPECT_EQ(status_set_worker, true);
  1451. // Create a RandomDataset which has column names "col1" and "col2"
  1452. std::shared_ptr<SchemaObj> schema = Schema();
  1453. ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt16, {5}));
  1454. ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt32, {3}));
  1455. std::shared_ptr<Dataset> ds = RandomData(3, schema);
  1456. EXPECT_NE(ds, nullptr);
  1457. // Create a truncate_sequence_pair operation on ds
  1458. std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(4);
  1459. EXPECT_NE(truncate_sequence_pair, nullptr);
  1460. // Create Map operation on ds
  1461. ds = ds->Map({truncate_sequence_pair}, {"col1", "col2"});
  1462. EXPECT_NE(ds, nullptr);
  1463. // Create an iterator over the result of the above dataset
  1464. // This will trigger the creation of the Execution Tree and launch it.
  1465. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1466. EXPECT_NE(iter, nullptr);
  1467. // Iterate the dataset and get each row
  1468. std::unordered_map<std::string, mindspore::MSTensor> row;
  1469. ASSERT_OK(iter->GetNextRow(&row));
  1470. std::vector<std::vector<int16_t>> expected1 = {{-29556, -29556}, {-18505, -18505}, {-25958, -25958}};
  1471. std::vector<std::vector<int32_t>> expected2 = {
  1472. {-1751672937, -1751672937}, {-656877352, -656877352}, {-606348325, -606348325}};
  1473. uint64_t i = 0;
  1474. while (row.size() != 0) {
  1475. auto ind1 = row["col1"];
  1476. auto ind2 = row["col2"];
  1477. std::shared_ptr<Tensor> de_expected_tensor1;
  1478. ASSERT_OK(Tensor::CreateFromVector(expected1[i], &de_expected_tensor1));
  1479. mindspore::MSTensor expected_tensor1 =
  1480. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor1));
  1481. EXPECT_MSTENSOR_EQ(ind1, expected_tensor1);
  1482. std::shared_ptr<Tensor> de_expected_tensor2;
  1483. ASSERT_OK(Tensor::CreateFromVector(expected2[i], &de_expected_tensor2));
  1484. mindspore::MSTensor expected_tensor2 =
  1485. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor2));
  1486. EXPECT_MSTENSOR_EQ(ind2, expected_tensor2);
  1487. ASSERT_OK(iter->GetNextRow(&row));
  1488. i++;
  1489. }
  1490. EXPECT_EQ(i, 3);
  1491. // Manually terminate the pipeline
  1492. iter->Stop();
  1493. // Restore original seed and num_parallel_workers
  1494. status_set_seed = config::set_seed(original_seed);
  1495. EXPECT_EQ(status_set_seed, true);
  1496. status_set_worker = config::set_num_parallel_workers(original_worker);
  1497. EXPECT_EQ(status_set_worker, true);
  1498. }
  1499. TEST_F(MindDataTestPipeline, TestTruncateSequencePairSuccess2) {
  1500. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairSuccess2.";
  1501. // Testing basic TruncateSequencePair with odd max_length
  1502. // Set seed for RandomDataset
  1503. auto original_seed = config::get_seed();
  1504. bool status_set_seed = config::set_seed(1);
  1505. EXPECT_EQ(status_set_seed, true);
  1506. // Set num_parallel_workers for RandomDataset
  1507. auto original_worker = config::get_num_parallel_workers();
  1508. bool status_set_worker = config::set_num_parallel_workers(1);
  1509. EXPECT_EQ(status_set_worker, true);
  1510. // Create a RandomDataset which has column names "col1" and "col2"
  1511. std::shared_ptr<SchemaObj> schema = Schema();
  1512. ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt32, {4}));
  1513. ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt64, {4}));
  1514. std::shared_ptr<Dataset> ds = RandomData(4, schema);
  1515. EXPECT_NE(ds, nullptr);
  1516. // Create a truncate_sequence_pair operation on ds
  1517. std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(5);
  1518. EXPECT_NE(truncate_sequence_pair, nullptr);
  1519. // Create Map operation on ds
  1520. ds = ds->Map({truncate_sequence_pair}, {"col1", "col2"});
  1521. EXPECT_NE(ds, nullptr);
  1522. // Create an iterator over the result of the above dataset
  1523. // This will trigger the creation of the Execution Tree and launch it.
  1524. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1525. EXPECT_NE(iter, nullptr);
  1526. // Iterate the dataset and get each row
  1527. std::unordered_map<std::string, mindspore::MSTensor> row;
  1528. ASSERT_OK(iter->GetNextRow(&row));
  1529. std::vector<std::vector<int32_t>> expected1 = {{1785358954, 1785358954, 1785358954},
  1530. {-1195853640, -1195853640, -1195853640},
  1531. {0, 0, 0},
  1532. {1296911693, 1296911693, 1296911693}};
  1533. std::vector<std::vector<int64_t>> expected2 = {
  1534. {-1, -1}, {-1229782938247303442, -1229782938247303442}, {2314885530818453536, 2314885530818453536}, {-1, -1}};
  1535. uint64_t i = 0;
  1536. while (row.size() != 0) {
  1537. auto ind1 = row["col1"];
  1538. auto ind2 = row["col2"];
  1539. std::shared_ptr<Tensor> de_expected_tensor1;
  1540. ASSERT_OK(Tensor::CreateFromVector(expected1[i], &de_expected_tensor1));
  1541. mindspore::MSTensor expected_tensor1 =
  1542. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor1));
  1543. EXPECT_MSTENSOR_EQ(ind1, expected_tensor1);
  1544. std::shared_ptr<Tensor> de_expected_tensor2;
  1545. ASSERT_OK(Tensor::CreateFromVector(expected2[i], &de_expected_tensor2));
  1546. mindspore::MSTensor expected_tensor2 =
  1547. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor2));
  1548. EXPECT_MSTENSOR_EQ(ind2, expected_tensor2);
  1549. ASSERT_OK(iter->GetNextRow(&row));
  1550. i++;
  1551. }
  1552. EXPECT_EQ(i, 4);
  1553. // Manually terminate the pipeline
  1554. iter->Stop();
  1555. // Restore original seed and num_parallel_workers
  1556. status_set_seed = config::set_seed(original_seed);
  1557. EXPECT_EQ(status_set_seed, true);
  1558. status_set_worker = config::set_num_parallel_workers(original_worker);
  1559. EXPECT_EQ(status_set_worker, true);
  1560. }
  1561. TEST_F(MindDataTestPipeline, TestTruncateSequencePairFail) {
  1562. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairFail.";
  1563. // Testing TruncateSequencePair with negative max_length
  1564. // Create a RandomDataset which has column names "col1" and "col2"
  1565. std::shared_ptr<SchemaObj> schema = Schema();
  1566. ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt8, {3}));
  1567. ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt8, {3}));
  1568. std::shared_ptr<Dataset> ds = RandomData(3, schema);
  1569. EXPECT_NE(ds, nullptr);
  1570. // Create a truncate_sequence_pair operation on ds
  1571. std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(-1);
  1572. EXPECT_NE(truncate_sequence_pair, nullptr);
  1573. // Create a Map operation on ds
  1574. ds = ds->Map({truncate_sequence_pair});
  1575. EXPECT_NE(ds, nullptr);
  1576. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1577. // Expect failure: invalid TruncateSequencePair input (invalid parameter with negative max_length)
  1578. EXPECT_EQ(iter, nullptr);
  1579. }
  1580. TEST_F(MindDataTestPipeline, TestNgramSuccess) {
  1581. // Testing the parameter of Ngram interface.
  1582. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess.";
  1583. // Create a TextFile dataset
  1584. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1585. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1586. EXPECT_NE(ds, nullptr);
  1587. // Create white_tokenizer operation on ds
  1588. std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  1589. EXPECT_NE(white_tokenizer, nullptr);
  1590. // Create sliding_window operation on ds
  1591. std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2}, {"_", 1}, {"_", 1}, " "));
  1592. EXPECT_NE(ngram_op, nullptr);
  1593. // Create Map operation on ds
  1594. ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
  1595. EXPECT_NE(ds, nullptr);
  1596. // Create an iterator over the result of the above dataset
  1597. // This will trigger the creation of the Execution Tree and launch it.
  1598. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1599. EXPECT_NE(iter, nullptr);
  1600. // Iterate the dataset and get each row
  1601. std::unordered_map<std::string, mindspore::MSTensor> row;
  1602. ASSERT_OK(iter->GetNextRow(&row));
  1603. std::vector<std::vector<std::string>> expected = {{"_ This", "This is", "is a", "a text", "text file.", "file. _"},
  1604. {"_ Be", "Be happy", "happy every", "every day.", "day. _"},
  1605. {"_ Good", "Good luck", "luck to", "to everyone.", "everyone. _"}};
  1606. uint64_t i = 0;
  1607. while (row.size() != 0) {
  1608. auto ind = row["text"];
  1609. std::shared_ptr<Tensor> de_expected_tensor;
  1610. int x = expected[i].size();
  1611. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
  1612. mindspore::MSTensor expected_tensor =
  1613. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1614. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  1615. ASSERT_OK(iter->GetNextRow(&row));
  1616. i++;
  1617. }
  1618. EXPECT_EQ(i, 3);
  1619. // Manually terminate the pipeline
  1620. iter->Stop();
  1621. }
  1622. TEST_F(MindDataTestPipeline, TestNgramSuccess1) {
  1623. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess1.";
  1624. // Create a TextFile dataset
  1625. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1626. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1627. EXPECT_NE(ds, nullptr);
  1628. // Create white_tokenizer operation on ds
  1629. std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  1630. EXPECT_NE(white_tokenizer, nullptr);
  1631. // Create sliding_window operation on ds
  1632. std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2, 3}, {"&", 2}, {"&", 2}, "-"));
  1633. EXPECT_NE(ngram_op, nullptr);
  1634. // Create Map operation on ds
  1635. ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
  1636. EXPECT_NE(ds, nullptr);
  1637. // Create an iterator over the result of the above dataset
  1638. // This will trigger the creation of the Execution Tree and launch it.
  1639. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1640. EXPECT_NE(iter, nullptr);
  1641. // Iterate the dataset and get each row
  1642. std::unordered_map<std::string, mindspore::MSTensor> row;
  1643. ASSERT_OK(iter->GetNextRow(&row));
  1644. std::vector<std::vector<std::string>> expected = {
  1645. {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a",
  1646. "is-a-text",
  1647. "a-text-file.", "text-file.-&", "file.-&-&"},
  1648. {"&-Be", "Be-happy", "happy-every", "every-day.", "day.-&", "&-&-Be", "&-Be-happy", "Be-happy-every",
  1649. "happy-every-day.", "every-day.-&", "day.-&-&"},
  1650. {"&-Good", "Good-luck", "luck-to", "to-everyone.", "everyone.-&", "&-&-Good", "&-Good-luck", "Good-luck-to",
  1651. "luck-to-everyone.", "to-everyone.-&", "everyone.-&-&"}};
  1652. uint64_t i = 0;
  1653. while (row.size() != 0) {
  1654. auto ind = row["text"];
  1655. std::shared_ptr<Tensor> de_expected_tensor;
  1656. int x = expected[i].size();
  1657. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
  1658. mindspore::MSTensor expected_tensor =
  1659. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1660. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  1661. ASSERT_OK(iter->GetNextRow(&row));
  1662. i++;
  1663. }
  1664. EXPECT_EQ(i, 3);
  1665. // Manually terminate the pipeline
  1666. iter->Stop();
  1667. }
  1668. TEST_F(MindDataTestPipeline, TestNgramFail1) {
  1669. // Testing the incorrect parameter of Ngram interface.
  1670. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail1.";
  1671. // Create a TextFile dataset
  1672. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1673. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1674. EXPECT_NE(ds, nullptr);
  1675. // Create sliding_window operation on ds
  1676. // Testing the vector of ngram is empty
  1677. std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({}));
  1678. EXPECT_NE(ngram_op, nullptr);
  1679. // Create a Map operation on ds
  1680. ds = ds->Map({ngram_op});
  1681. EXPECT_NE(ds, nullptr);
  1682. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1683. // Expect failure: invalid Ngram input (the vector of ngram is empty)
  1684. EXPECT_EQ(iter, nullptr);
  1685. }
  1686. TEST_F(MindDataTestPipeline, TestNgramFail2) {
  1687. // Testing the incorrect parameter of Ngram interface.
  1688. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail2.";
  1689. // Create a TextFile dataset
  1690. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1691. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1692. EXPECT_NE(ds, nullptr);
  1693. // Create sliding_window operation on ds
  1694. // Testing the value of ngrams vector less than and equal to 0
  1695. std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({0}));
  1696. EXPECT_NE(ngram_op, nullptr);
  1697. // Create a Map operation on ds
  1698. ds = ds->Map({ngram_op});
  1699. EXPECT_NE(ds, nullptr);
  1700. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1701. // Expect failure: invalid Ngram input (the value of ngrams vector less than and equal to 0)
  1702. EXPECT_EQ(iter, nullptr);
  1703. }
  1704. TEST_F(MindDataTestPipeline, TestNgramFail3) {
  1705. // Testing the incorrect parameter of Ngram interface.
  1706. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail3.";
  1707. // Create a TextFile dataset
  1708. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1709. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1710. EXPECT_NE(ds, nullptr);
  1711. // Create sliding_window operation on ds
  1712. // Testing the value of ngrams vector less than and equal to 0
  1713. std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({-2}));
  1714. EXPECT_NE(ngram_op, nullptr);
  1715. // Create a Map operation on ds
  1716. ds = ds->Map({ngram_op});
  1717. EXPECT_NE(ds, nullptr);
  1718. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1719. // Expect failure: invalid Ngram input (the value of ngrams vector less than and equal to 0)
  1720. EXPECT_EQ(iter, nullptr);
  1721. }
  1722. TEST_F(MindDataTestPipeline, TestNgramFail4) {
  1723. // Testing the incorrect parameter of Ngram interface.
  1724. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail4.";
  1725. // Create a TextFile dataset
  1726. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1727. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1728. EXPECT_NE(ds, nullptr);
  1729. // Create sliding_window operation on ds
  1730. // Testing the second parameter pad_width in left_pad vector less than 0
  1731. std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2}, {"", -1}));
  1732. EXPECT_NE(ngram_op, nullptr);
  1733. // Create a Map operation on ds
  1734. ds = ds->Map({ngram_op});
  1735. EXPECT_NE(ds, nullptr);
  1736. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1737. // Expect failure: invalid Ngram input (the second parameter pad_width in left_pad vector less than 0)
  1738. EXPECT_EQ(iter, nullptr);
  1739. }
  1740. TEST_F(MindDataTestPipeline, TestNgramFail5) {
  1741. // Testing the incorrect parameter of Ngram interface.
  1742. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail5.";
  1743. // Create a TextFile dataset
  1744. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1745. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1746. EXPECT_NE(ds, nullptr);
  1747. // Create sliding_window operation on ds
  1748. // Testing the second parameter pad_width in right_pad vector less than 0
  1749. std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2}, {"", 1}, {"", -1}));
  1750. EXPECT_NE(ngram_op, nullptr);
  1751. // Create a Map operation on ds
  1752. ds = ds->Map({ngram_op});
  1753. EXPECT_NE(ds, nullptr);
  1754. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1755. // Expect failure: invalid Ngram input (the second parameter pad_width in left_pad vector less than 0)
  1756. EXPECT_EQ(iter, nullptr);
  1757. }
  1758. TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success) {
  1759. // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfkc.
  1760. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success.";
  1761. // Create a TextFile dataset
  1762. std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  1763. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1764. EXPECT_NE(ds, nullptr);
  1765. // Create normalizeutf8 operation on ds
  1766. std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfkc);
  1767. EXPECT_NE(normalizeutf8, nullptr);
  1768. // Create Map operation on ds
  1769. ds = ds->Map({normalizeutf8}, {"text"});
  1770. EXPECT_NE(ds, nullptr);
  1771. // Create an iterator over the result of the above dataset
  1772. // This will trigger the creation of the Execution Tree and launch it.
  1773. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1774. EXPECT_NE(iter, nullptr);
  1775. // Iterate the dataset and get each row
  1776. std::unordered_map<std::string, mindspore::MSTensor> row;
  1777. ASSERT_OK(iter->GetNextRow(&row));
  1778. std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};
  1779. uint64_t i = 0;
  1780. while (row.size() != 0) {
  1781. auto ind = row["text"];
  1782. std::shared_ptr<Tensor> de_expected_tensor;
  1783. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  1784. mindspore::MSTensor ms_expected_tensor =
  1785. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1786. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  1787. ASSERT_OK(iter->GetNextRow(&row));
  1788. i++;
  1789. }
  1790. EXPECT_EQ(i, 6);
  1791. // Manually terminate the pipeline
  1792. iter->Stop();
  1793. }
  1794. TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success1) {
  1795. // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfc.
  1796. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success1.";
  1797. // Create a TextFile dataset
  1798. std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  1799. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1800. EXPECT_NE(ds, nullptr);
  1801. // Create normalizeutf8 operation on ds
  1802. std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfc);
  1803. EXPECT_NE(normalizeutf8, nullptr);
  1804. // Create Map operation on ds
  1805. ds = ds->Map({normalizeutf8}, {"text"});
  1806. EXPECT_NE(ds, nullptr);
  1807. // Create an iterator over the result of the above dataset
  1808. // This will trigger the creation of the Execution Tree and launch it.
  1809. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1810. EXPECT_NE(iter, nullptr);
  1811. // Iterate the dataset and get each row
  1812. std::unordered_map<std::string, mindspore::MSTensor> row;
  1813. ASSERT_OK(iter->GetNextRow(&row));
  1814. std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};
  1815. uint64_t i = 0;
  1816. while (row.size() != 0) {
  1817. auto ind = row["text"];
  1818. std::shared_ptr<Tensor> de_expected_tensor;
  1819. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  1820. mindspore::MSTensor ms_expected_tensor =
  1821. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1822. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  1823. ASSERT_OK(iter->GetNextRow(&row));
  1824. i++;
  1825. }
  1826. EXPECT_EQ(i, 6);
  1827. // Manually terminate the pipeline
  1828. iter->Stop();
  1829. }
  1830. TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success2) {
  1831. // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfd.
  1832. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success2.";
  1833. // Create a TextFile dataset
  1834. std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  1835. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1836. EXPECT_NE(ds, nullptr);
  1837. // Create normalizeutf8 operation on ds
  1838. std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfd);
  1839. EXPECT_NE(normalizeutf8, nullptr);
  1840. // Create Map operation on ds
  1841. ds = ds->Map({normalizeutf8}, {"text"});
  1842. EXPECT_NE(ds, nullptr);
  1843. // Create an iterator over the result of the above dataset
  1844. // This will trigger the creation of the Execution Tree and launch it.
  1845. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1846. EXPECT_NE(iter, nullptr);
  1847. // Iterate the dataset and get each row
  1848. std::unordered_map<std::string, mindspore::MSTensor> row;
  1849. ASSERT_OK(iter->GetNextRow(&row));
  1850. std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};
  1851. uint64_t i = 0;
  1852. while (row.size() != 0) {
  1853. auto ind = row["text"];
  1854. std::shared_ptr<Tensor> de_expected_tensor;
  1855. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  1856. mindspore::MSTensor ms_expected_tensor =
  1857. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1858. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  1859. ASSERT_OK(iter->GetNextRow(&row));
  1860. i++;
  1861. }
  1862. EXPECT_EQ(i, 6);
  1863. // Manually terminate the pipeline
  1864. iter->Stop();
  1865. }
  1866. TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success3) {
  1867. // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfkd.
  1868. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success3.";
  1869. // Create a TextFile dataset
  1870. std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  1871. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1872. EXPECT_NE(ds, nullptr);
  1873. // Create normalizeutf8 operation on ds
  1874. std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfkd);
  1875. EXPECT_NE(normalizeutf8, nullptr);
  1876. // Create Map operation on ds
  1877. ds = ds->Map({normalizeutf8}, {"text"});
  1878. EXPECT_NE(ds, nullptr);
  1879. // Create an iterator over the result of the above dataset
  1880. // This will trigger the creation of the Execution Tree and launch it.
  1881. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1882. EXPECT_NE(iter, nullptr);
  1883. // Iterate the dataset and get each row
  1884. std::unordered_map<std::string, mindspore::MSTensor> row;
  1885. ASSERT_OK(iter->GetNextRow(&row));
  1886. std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};
  1887. uint64_t i = 0;
  1888. while (row.size() != 0) {
  1889. auto ind = row["text"];
  1890. std::shared_ptr<Tensor> de_expected_tensor;
  1891. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  1892. mindspore::MSTensor ms_expected_tensor =
  1893. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1894. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  1895. ASSERT_OK(iter->GetNextRow(&row));
  1896. i++;
  1897. }
  1898. EXPECT_EQ(i, 6);
  1899. // Manually terminate the pipeline
  1900. iter->Stop();
  1901. }
  1902. TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess) {
  1903. // Testing the parameter of RegexReplace interface when the replace_all is true.
  1904. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess.";
  1905. // Create a TextFile dataset
  1906. std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  1907. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1908. EXPECT_NE(ds, nullptr);
  1909. // Create regex_replace operation on ds
  1910. std::shared_ptr<TensorTransform> regex_replace = std::make_shared<text::RegexReplace>("\\s+", "_", true);
  1911. EXPECT_NE(regex_replace, nullptr);
  1912. // Create Map operation on ds
  1913. ds = ds->Map({regex_replace}, {"text"});
  1914. EXPECT_NE(ds, nullptr);
  1915. // Create an iterator over the result of the above dataset
  1916. // This will trigger the creation of the Execution Tree and launch it.
  1917. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1918. EXPECT_NE(iter, nullptr);
  1919. // Iterate the dataset and get each row
  1920. std::unordered_map<std::string, mindspore::MSTensor> row;
  1921. ASSERT_OK(iter->GetNextRow(&row));
  1922. std::vector<std::string> expected = {"Hello_World", "Let's_Go", "1:hello", "2:world",
  1923. "31:beijing", "Welcome_to_China!", "_我_不想_长大_", "Welcome_to_Shenzhen!"};
  1924. uint64_t i = 0;
  1925. while (row.size() != 0) {
  1926. auto ind = row["text"];
  1927. std::shared_ptr<Tensor> de_expected_tensor;
  1928. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  1929. mindspore::MSTensor ms_expected_tensor =
  1930. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1931. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  1932. ASSERT_OK(iter->GetNextRow(&row));
  1933. i++;
  1934. }
  1935. EXPECT_EQ(i, 8);
  1936. // Manually terminate the pipeline
  1937. iter->Stop();
  1938. }
  1939. TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess1) {
  1940. // Testing the parameter of RegexReplace interface when the replace_all is false.
  1941. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess1.";
  1942. // Create a TextFile dataset
  1943. std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  1944. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1945. EXPECT_NE(ds, nullptr);
  1946. // Create regex_replace operation on ds
  1947. std::shared_ptr<TensorTransform> regex_replace = std::make_shared<text::RegexReplace>("\\s+", "_", false);
  1948. EXPECT_NE(regex_replace, nullptr);
  1949. // Create Map operation on ds
  1950. ds = ds->Map({regex_replace}, {"text"});
  1951. EXPECT_NE(ds, nullptr);
  1952. // Create an iterator over the result of the above dataset
  1953. // This will trigger the creation of the Execution Tree and launch it.
  1954. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1955. EXPECT_NE(iter, nullptr);
  1956. // Iterate the dataset and get each row
  1957. std::unordered_map<std::string, mindspore::MSTensor> row;
  1958. ASSERT_OK(iter->GetNextRow(&row));
  1959. std::vector<std::string> expected = {"Hello_World", "Let's_Go", "1:hello", "2:world",
  1960. "31:beijing", "Welcome_to China!", "_我 不想 长大 ", "Welcome_to Shenzhen!"};
  1961. uint64_t i = 0;
  1962. while (row.size() != 0) {
  1963. auto ind = row["text"];
  1964. std::shared_ptr<Tensor> de_expected_tensor;
  1965. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  1966. mindspore::MSTensor ms_expected_tensor =
  1967. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1968. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  1969. ASSERT_OK(iter->GetNextRow(&row));
  1970. i++;
  1971. }
  1972. EXPECT_EQ(i, 8);
  1973. // Manually terminate the pipeline
  1974. iter->Stop();
  1975. }
  1976. TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess) {
  1977. // Testing the parameter of RegexTokenizer interface when the with_offsets is false.
  1978. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess.";
  1979. // Create a TextFile dataset
  1980. std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  1981. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1982. EXPECT_NE(ds, nullptr);
  1983. // Create regex_tokenizer operation on ds
  1984. std::shared_ptr<TensorTransform> regex_tokenizer = std::make_shared<text::RegexTokenizer>("\\s+", "\\s+", false);
  1985. EXPECT_NE(regex_tokenizer, nullptr);
  1986. // Create Map operation on ds
  1987. ds = ds->Map({regex_tokenizer}, {"text"});
  1988. EXPECT_NE(ds, nullptr);
  1989. // Create an iterator over the result of the above dataset
  1990. // This will trigger the creation of the Execution Tree and launch it.
  1991. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1992. EXPECT_NE(iter, nullptr);
  1993. // Iterate the dataset and get each row
  1994. std::unordered_map<std::string, mindspore::MSTensor> row;
  1995. ASSERT_OK(iter->GetNextRow(&row));
  1996. std::vector<std::vector<std::string>> expected = {{"Hello", " ", "World"},
  1997. {"Let's", " ", "Go"},
  1998. {"1:hello"},
  1999. {"2:world"},
  2000. {"31:beijing"},
  2001. {"Welcome", " ", "to", " ", "China!"},
  2002. {" ", "我", " ", "不想", " ", "长大", " "},
  2003. {"Welcome", " ", "to", " ", "Shenzhen!"}};
  2004. uint64_t i = 0;
  2005. while (row.size() != 0) {
  2006. auto ind = row["text"];
  2007. std::shared_ptr<Tensor> de_expected_tensor;
  2008. int x = expected[i].size();
  2009. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
  2010. mindspore::MSTensor expected_tensor =
  2011. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2012. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  2013. ASSERT_OK(iter->GetNextRow(&row));
  2014. i++;
  2015. }
  2016. EXPECT_EQ(i, 8);
  2017. // Manually terminate the pipeline
  2018. iter->Stop();
  2019. }
  2020. TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess1) {
  2021. // Testing the parameter of RegexTokenizer interface when the with_offsets is true.
  2022. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess1.";
  2023. // Create a TextFile dataset
  2024. std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  2025. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2026. EXPECT_NE(ds, nullptr);
  2027. // Create regex_tokenizer operation on ds
  2028. std::shared_ptr<TensorTransform> regex_tokenizer = std::make_shared<text::RegexTokenizer>("\\s+", "\\s+", true);
  2029. EXPECT_NE(regex_tokenizer, nullptr);
  2030. // Create Map operation on ds
  2031. ds = ds->Map({regex_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
  2032. {"token", "offsets_start", "offsets_limit"});
  2033. EXPECT_NE(ds, nullptr);
  2034. // Create an iterator over the result of the above dataset
  2035. // This will trigger the creation of the Execution Tree and launch it.
  2036. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2037. EXPECT_NE(iter, nullptr);
  2038. // Iterate the dataset and get each row
  2039. std::unordered_map<std::string, mindspore::MSTensor> row;
  2040. ASSERT_OK(iter->GetNextRow(&row));
  2041. std::vector<std::vector<std::string>> expected_tokens = {{"Hello", " ", "World"},
  2042. {"Let's", " ", "Go"},
  2043. {"1:hello"},
  2044. {"2:world"},
  2045. {"31:beijing"},
  2046. {"Welcome", " ", "to", " ", "China!"},
  2047. {" ", "我", " ", "不想", " ", "长大", " "},
  2048. {"Welcome", " ", "to", " ", "Shenzhen!"}};
  2049. std::vector<std::vector<uint32_t>> expected_offsets_start = {
  2050. {0, 5, 6}, {0, 5, 6}, {0}, {0}, {0}, {0, 7, 8, 10, 11}, {0, 2, 5, 6, 12, 14, 20}, {0, 7, 8, 10, 11}};
  2051. std::vector<std::vector<uint32_t>> expected_offsets_limit = {
  2052. {5, 6, 11}, {5, 6, 8}, {7}, {7}, {10}, {7, 8, 10, 11, 17}, {2, 5, 6, 12, 14, 20, 21}, {7, 8, 10, 11, 20}};
  2053. uint64_t i = 0;
  2054. while (row.size() != 0) {
  2055. auto token = row["token"];
  2056. auto start = row["offsets_start"];
  2057. auto limit = row["offsets_limit"];
  2058. std::shared_ptr<Tensor> de_expected_tokens;
  2059. int x = expected_tokens[i].size();
  2060. ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
  2061. mindspore::MSTensor ms_expected_tokens =
  2062. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  2063. EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
  2064. std::shared_ptr<Tensor> de_expected_offsets_start;
  2065. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
  2066. mindspore::MSTensor ms_expected_offsets_start =
  2067. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  2068. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  2069. std::shared_ptr<Tensor> de_expected_offsets_limit;
  2070. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
  2071. mindspore::MSTensor ms_expected_offsets_limit =
  2072. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  2073. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  2074. ASSERT_OK(iter->GetNextRow(&row));
  2075. i++;
  2076. }
  2077. EXPECT_EQ(i, 8);
  2078. // Manually terminate the pipeline
  2079. iter->Stop();
  2080. }
  2081. TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess) {
  2082. // Testing the parameter of UnicodeCharTokenizer interface when the with_offsets is default.
  2083. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess.";
  2084. // Create a TextFile dataset
  2085. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  2086. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2087. EXPECT_NE(ds, nullptr);
  2088. // Create unicodechar_tokenizer operation on ds
  2089. std::shared_ptr<TensorTransform> unicodechar_tokenizer = std::make_shared<text::UnicodeCharTokenizer>();
  2090. EXPECT_NE(unicodechar_tokenizer, nullptr);
  2091. // Create Map operation on ds
  2092. ds = ds->Map({unicodechar_tokenizer}, {"text"});
  2093. EXPECT_NE(ds, nullptr);
  2094. // Create an iterator over the result of the above dataset
  2095. // This will trigger the creation of the Execution Tree and launch it.
  2096. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2097. EXPECT_NE(iter, nullptr);
  2098. // Iterate the dataset and get each row
  2099. std::unordered_map<std::string, mindspore::MSTensor> row;
  2100. ASSERT_OK(iter->GetNextRow(&row));
  2101. std::vector<std::vector<std::string>> expected = {
  2102. {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
  2103. {"北", "京", "欢", "迎", "您", "!"},
  2104. {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
  2105. {" ", " "}};
  2106. uint64_t i = 0;
  2107. while (row.size() != 0) {
  2108. auto ind = row["text"];
  2109. std::shared_ptr<Tensor> de_expected_tensor;
  2110. int x = expected[i].size();
  2111. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
  2112. mindspore::MSTensor expected_tensor =
  2113. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2114. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  2115. ASSERT_OK(iter->GetNextRow(&row));
  2116. i++;
  2117. }
  2118. EXPECT_EQ(i, 4);
  2119. // Manually terminate the pipeline
  2120. iter->Stop();
  2121. }
  2122. TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess1) {
  2123. // Testing the parameter of UnicodeCharTokenizer interface when the with_offsets is true.
  2124. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess1.";
  2125. // Create a TextFile dataset
  2126. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  2127. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2128. EXPECT_NE(ds, nullptr);
  2129. // Create unicodechar_tokenizer operation on ds
  2130. std::shared_ptr<TensorTransform> unicodechar_tokenizer = std::make_shared<text::UnicodeCharTokenizer>(true);
  2131. EXPECT_NE(unicodechar_tokenizer, nullptr);
  2132. // Create Map operation on ds
  2133. ds = ds->Map({unicodechar_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
  2134. {"token", "offsets_start", "offsets_limit"});
  2135. EXPECT_NE(ds, nullptr);
  2136. // Create an iterator over the result of the above dataset
  2137. // This will trigger the creation of the Execution Tree and launch it.
  2138. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2139. EXPECT_NE(iter, nullptr);
  2140. // Iterate the dataset and get each row
  2141. std::unordered_map<std::string, mindspore::MSTensor> row;
  2142. ASSERT_OK(iter->GetNextRow(&row));
  2143. std::vector<std::vector<std::string>> expected_tokens = {
  2144. {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
  2145. {"北", "京", "欢", "迎", "您", "!"},
  2146. {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
  2147. {" ", " "}};
  2148. std::vector<std::vector<uint32_t>> expected_offsets_start = {
  2149. {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18},
  2150. {0, 3, 6, 9, 12, 15},
  2151. {0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16},
  2152. {0, 1}};
  2153. std::vector<std::vector<uint32_t>> expected_offsets_limit = {
  2154. {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
  2155. {3, 6, 9, 12, 15, 18},
  2156. {3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17},
  2157. {1, 2}};
  2158. uint64_t i = 0;
  2159. while (row.size() != 0) {
  2160. auto token = row["token"];
  2161. auto start = row["offsets_start"];
  2162. auto limit = row["offsets_limit"];
  2163. std::shared_ptr<Tensor> de_expected_tokens;
  2164. int x = expected_tokens[i].size();
  2165. ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
  2166. mindspore::MSTensor ms_expected_tokens =
  2167. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  2168. EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
  2169. std::shared_ptr<Tensor> de_expected_offsets_start;
  2170. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
  2171. mindspore::MSTensor ms_expected_offsets_start =
  2172. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  2173. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  2174. std::shared_ptr<Tensor> de_expected_offsets_limit;
  2175. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
  2176. mindspore::MSTensor ms_expected_offsets_limit =
  2177. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  2178. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  2179. ASSERT_OK(iter->GetNextRow(&row));
  2180. i++;
  2181. }
  2182. EXPECT_EQ(i, 4);
  2183. // Manually terminate the pipeline
  2184. iter->Stop();
  2185. }
  // Word lists shared by the WordpieceTokenizer tests below; each test builds a Vocab from one of them.
  // English list: whole words plus "##"-prefixed continuation pieces.
  std::vector<std::string> vocab_english{
      "book", "cholera", "era", "favor", "##ite", "my", "is", "love", "dur", "##ing", "the",
  };

  // Chinese list: one entry per character.
  std::vector<std::string> vocab_chinese{
      "我", "最", "喜", "欢", "的", "书", "是", "霍", "乱", "时", "期", "爱", "情",
  };
  2189. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess1) {
  2190. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess1.";
  2191. // Test WordpieceTokenizer with default parameters on English vocab
  2192. // Create a TextFile dataset
  2193. std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  2194. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2195. EXPECT_NE(ds, nullptr);
  2196. // Create Take operation on ds
  2197. ds = ds->Take(10);
  2198. EXPECT_NE(ds, nullptr);
  2199. // Create a vocab from vector
  2200. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  2201. Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  2202. EXPECT_EQ(s, Status::OK());
  2203. // Create WordpieceTokenizer operation on ds
  2204. std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab);
  2205. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2206. // Create Map operation on ds
  2207. ds = ds->Map({wordpiece_tokenizer}, {"text"});
  2208. EXPECT_NE(ds, nullptr);
  2209. // Create an iterator over the result of the above dataset
  2210. // This will trigger the creation of the Execution Tree and launch it.
  2211. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2212. EXPECT_NE(iter, nullptr);
  2213. // Iterate the dataset and get each row
  2214. std::unordered_map<std::string, mindspore::MSTensor> row;
  2215. ASSERT_OK(iter->GetNextRow(&row));
  2216. std::vector<std::vector<std::string>> expected = {
  2217. {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};
  2218. uint64_t i = 0;
  2219. while (row.size() != 0) {
  2220. auto txt = row["text"];
  2221. std::shared_ptr<Tensor> de_expected_tensor;
  2222. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  2223. mindspore::MSTensor expected_tensor =
  2224. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2225. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  2226. ASSERT_OK(iter->GetNextRow(&row));
  2227. i++;
  2228. }
  2229. EXPECT_EQ(i, 10);
  2230. // Manually terminate the pipeline
  2231. iter->Stop();
  2232. }
  2233. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess2) {
  2234. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess2.";
  2235. // Test WordpieceTokenizer with empty unknown_token
  2236. // Create a TextFile dataset
  2237. std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  2238. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2239. EXPECT_NE(ds, nullptr);
  2240. // Create Take operation on ds
  2241. ds = ds->Take(10);
  2242. EXPECT_NE(ds, nullptr);
  2243. // Create a vocab from vector
  2244. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  2245. Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  2246. EXPECT_EQ(s, Status::OK());
  2247. // Create WordpieceTokenizer operation on ds
  2248. std::shared_ptr<TensorTransform> wordpiece_tokenizer =
  2249. std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "", false);
  2250. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2251. // Create Map operation on ds
  2252. ds = ds->Map({wordpiece_tokenizer}, {"text"});
  2253. EXPECT_NE(ds, nullptr);
  2254. // Create an iterator over the result of the above dataset
  2255. // This will trigger the creation of the Execution Tree and launch it.
  2256. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2257. EXPECT_NE(iter, nullptr);
  2258. // Iterate the dataset and get each row
  2259. std::unordered_map<std::string, mindspore::MSTensor> row;
  2260. ASSERT_OK(iter->GetNextRow(&row));
  2261. std::vector<std::vector<std::string>> expected = {
  2262. {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"what"}};
  2263. uint64_t i = 0;
  2264. while (row.size() != 0) {
  2265. auto txt = row["text"];
  2266. std::shared_ptr<Tensor> de_expected_tensor;
  2267. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  2268. mindspore::MSTensor expected_tensor =
  2269. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2270. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  2271. ASSERT_OK(iter->GetNextRow(&row));
  2272. i++;
  2273. }
  2274. EXPECT_EQ(i, 10);
  2275. // Manually terminate the pipeline
  2276. iter->Stop();
  2277. }
  2278. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess3) {
  2279. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess3.";
  2280. // Test WordpieceTokenizer with non-default max_bytes_per_token
  2281. // Create a TextFile dataset
  2282. std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  2283. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2284. EXPECT_NE(ds, nullptr);
  2285. // Create Take operation on ds
  2286. ds = ds->Take(10);
  2287. EXPECT_NE(ds, nullptr);
  2288. // Create a vocab from vector
  2289. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  2290. Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  2291. EXPECT_EQ(s, Status::OK());
  2292. // Create WordpieceTokenizer operation on ds
  2293. std::shared_ptr<TensorTransform> wordpiece_tokenizer =
  2294. std::make_shared<text::WordpieceTokenizer>(vocab, "##", 4, "[UNK]", false);
  2295. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2296. // Create Map operation on ds
  2297. ds = ds->Map({wordpiece_tokenizer}, {"text"});
  2298. EXPECT_NE(ds, nullptr);
  2299. // Create an iterator over the result of the above dataset
  2300. // This will trigger the creation of the Execution Tree and launch it.
  2301. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2302. EXPECT_NE(iter, nullptr);
  2303. // Iterate the dataset and get each row
  2304. std::unordered_map<std::string, mindspore::MSTensor> row;
  2305. ASSERT_OK(iter->GetNextRow(&row));
  2306. std::vector<std::vector<std::string>> expected = {{"my"}, {"[UNK]"}, {"book"}, {"is"}, {"love"},
  2307. {"[UNK]"}, {"the"}, {"[UNK]"}, {"era"}, {"[UNK]"}};
  2308. uint64_t i = 0;
  2309. while (row.size() != 0) {
  2310. auto txt = row["text"];
  2311. std::shared_ptr<Tensor> de_expected_tensor;
  2312. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  2313. mindspore::MSTensor expected_tensor =
  2314. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2315. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  2316. ASSERT_OK(iter->GetNextRow(&row));
  2317. i++;
  2318. }
  2319. EXPECT_EQ(i, 10);
  2320. // Manually terminate the pipeline
  2321. iter->Stop();
  2322. }
  2323. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess4) {
  2324. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess4.";
  2325. // Test WordpieceTokenizer with default parameters on Chinese vocab
  2326. // Create a TextFile dataset
  2327. std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  2328. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2329. EXPECT_NE(ds, nullptr);
  2330. // Create Skip operation on ds
  2331. ds = ds->Skip(10);
  2332. EXPECT_NE(ds, nullptr);
  2333. // Create Take operation on ds
  2334. ds = ds->Take(15);
  2335. EXPECT_NE(ds, nullptr);
  2336. // Create a vocab from vector
  2337. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  2338. Status s = Vocab::BuildFromVector(vocab_chinese, {}, true, &vocab);
  2339. EXPECT_EQ(s, Status::OK());
  2340. // Create WordpieceTokenizer operation on ds
  2341. std::shared_ptr<TensorTransform> wordpiece_tokenizer =
  2342. std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", false);
  2343. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2344. // Create Map operation on ds
  2345. ds = ds->Map({wordpiece_tokenizer}, {"text"});
  2346. EXPECT_NE(ds, nullptr);
  2347. // Create an iterator over the result of the above dataset
  2348. // This will trigger the creation of the Execution Tree and launch it.
  2349. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2350. EXPECT_NE(iter, nullptr);
  2351. // Iterate the dataset and get each row
  2352. std::unordered_map<std::string, mindspore::MSTensor> row;
  2353. ASSERT_OK(iter->GetNextRow(&row));
  2354. std::vector<std::vector<std::string>> expected = {{"我"}, {"最"}, {"喜"}, {"欢"}, {"的"}, {"书"}, {"是"}, {"霍"},
  2355. {"乱"}, {"时"}, {"期"}, {"的"}, {"爱"}, {"情"}, {"[UNK]"}};
  2356. uint64_t i = 0;
  2357. while (row.size() != 0) {
  2358. auto txt = row["text"];
  2359. std::shared_ptr<Tensor> de_expected_tensor;
  2360. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  2361. mindspore::MSTensor expected_tensor =
  2362. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2363. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  2364. ASSERT_OK(iter->GetNextRow(&row));
  2365. i++;
  2366. }
  2367. EXPECT_EQ(i, 15);
  2368. // Manually terminate the pipeline
  2369. iter->Stop();
  2370. }
  2371. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess5) {
  2372. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess5.";
  2373. // Test WordpieceTokenizer with with_offsets true
  2374. // Create a TextFile dataset
  2375. std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  2376. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2377. EXPECT_NE(ds, nullptr);
  2378. // Create Take operation on ds
  2379. ds = ds->Take(10);
  2380. EXPECT_NE(ds, nullptr);
  2381. // Create a vocab from vector
  2382. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  2383. Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  2384. EXPECT_EQ(s, Status::OK());
  2385. // Create WordpieceTokenizer operation on ds
  2386. std::shared_ptr<TensorTransform> wordpiece_tokenizer =
  2387. std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", true);
  2388. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2389. // Create Map operation on ds
  2390. ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  2391. EXPECT_NE(ds, nullptr);
  2392. // Create an iterator over the result of the above dataset
  2393. // This will trigger the creation of the Execution Tree and launch it.
  2394. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2395. EXPECT_NE(iter, nullptr);
  2396. // Iterate the dataset and get each row
  2397. std::unordered_map<std::string, mindspore::MSTensor> row;
  2398. ASSERT_OK(iter->GetNextRow(&row));
  2399. std::vector<std::vector<std::string>> expected = {
  2400. {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};
  2401. std::vector<std::vector<uint32_t>> expected_offsets_start = {{0}, {0, 5}, {0}, {0}, {0}, {0, 3}, {0}, {0}, {0}, {0}};
  2402. std::vector<std::vector<uint32_t>> expected_offsets_limit = {{2}, {5, 8}, {4}, {2}, {4}, {3, 6}, {3}, {7}, {3}, {4}};
  2403. uint64_t i = 0;
  2404. while (row.size() != 0) {
  2405. auto txt = row["token"];
  2406. std::shared_ptr<Tensor> de_expected_tensor;
  2407. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  2408. mindspore::MSTensor expected_tensor =
  2409. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2410. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  2411. auto start = row["offsets_start"];
  2412. std::shared_ptr<Tensor> de_expected_start_tensor;
  2413. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], &de_expected_start_tensor));
  2414. mindspore::MSTensor expected_start_tensor =
  2415. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_start_tensor));
  2416. EXPECT_MSTENSOR_EQ(start, expected_start_tensor);
  2417. auto limit = row["offsets_limit"];
  2418. std::shared_ptr<Tensor> de_expected_limit_tensor;
  2419. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], &de_expected_limit_tensor));
  2420. mindspore::MSTensor expected_limit_tensor =
  2421. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_limit_tensor));
  2422. EXPECT_MSTENSOR_EQ(limit, expected_limit_tensor);
  2423. ASSERT_OK(iter->GetNextRow(&row));
  2424. i++;
  2425. }
  2426. EXPECT_EQ(i, 10);
  2427. // Manually terminate the pipeline
  2428. iter->Stop();
  2429. }
  2430. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess6) {
  2431. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess6.";
  2432. // Test WordpieceTokenizer with max_bytes_per_token equals to 0
  2433. // Create a TextFile dataset
  2434. std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  2435. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2436. EXPECT_NE(ds, nullptr);
  2437. // Create Take operation on ds
  2438. ds = ds->Take(10);
  2439. EXPECT_NE(ds, nullptr);
  2440. // Create a vocab from vector
  2441. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  2442. Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  2443. EXPECT_EQ(s, Status::OK());
  2444. // Create WordpieceTokenizer operation on ds
  2445. std::shared_ptr<TensorTransform> wordpiece_tokenizer =
  2446. std::make_shared<text::WordpieceTokenizer>(vocab, "##", 0, "[UNK]", true);
  2447. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2448. // Create Map operation on ds
  2449. ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  2450. EXPECT_NE(ds, nullptr);
  2451. // Create an iterator over the result of the above dataset
  2452. // This will trigger the creation of the Execution Tree and launch it.
  2453. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2454. EXPECT_NE(iter, nullptr);
  2455. // Iterate the dataset and get each row
  2456. std::unordered_map<std::string, mindspore::MSTensor> row;
  2457. ASSERT_OK(iter->GetNextRow(&row));
  2458. std::vector<std::vector<std::string>> expected = {{"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"},
  2459. {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}};
  2460. uint64_t i = 0;
  2461. while (row.size() != 0) {
  2462. auto txt = row["token"];
  2463. std::shared_ptr<Tensor> de_expected_tensor;
  2464. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  2465. mindspore::MSTensor expected_tensor =
  2466. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2467. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  2468. ASSERT_OK(iter->GetNextRow(&row));
  2469. i++;
  2470. }
  2471. EXPECT_EQ(i, 10);
  2472. // Manually terminate the pipeline
  2473. iter->Stop();
  2474. }
  2475. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail1) {
  2476. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail1.";
  2477. // Test WordpieceTokenizer with nullptr vocab
  2478. // Create a TextFile dataset
  2479. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  2480. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2481. EXPECT_NE(ds, nullptr);
  2482. // Create WordpieceTokenizer operation on ds
  2483. std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(nullptr);
  2484. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2485. // Create a Map operation on ds
  2486. ds = ds->Map({wordpiece_tokenizer});
  2487. EXPECT_NE(ds, nullptr);
  2488. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2489. // Expect failure: invalid WordpieceTokenizer input with nullptr vocab
  2490. EXPECT_EQ(iter, nullptr);
  2491. }
  2492. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail2) {
  2493. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail2.";
  2494. // Test WordpieceTokenizer with negative max_bytes_per_token
  2495. // Create a TextFile dataset
  2496. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  2497. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2498. EXPECT_NE(ds, nullptr);
  2499. // Create a vocab from vector
  2500. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  2501. Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  2502. EXPECT_EQ(s, Status::OK());
  2503. // Create WordpieceTokenizer operation on ds
  2504. std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab, "##", -1);
  2505. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2506. // Create a Map operation on ds
  2507. ds = ds->Map({wordpiece_tokenizer});
  2508. EXPECT_NE(ds, nullptr);
  2509. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2510. // Expect failure: invalid WordpieceTokenizer input with nullptr vocab
  2511. EXPECT_EQ(iter, nullptr);
  2512. }
  2513. TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess) {
  2514. // Testing the parameter of UnicodeScriptTokenizer interface when the with_offsets and the keep_whitespace is default.
  2515. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess.";
  2516. // Create a TextFile dataset
  2517. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  2518. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2519. EXPECT_NE(ds, nullptr);
  2520. // Create unicodescript_tokenizer operation on ds
  2521. std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>();
  2522. EXPECT_NE(unicodescript_tokenizer, nullptr);
  2523. // Create Map operation on ds
  2524. ds = ds->Map({unicodescript_tokenizer}, {"text"});
  2525. EXPECT_NE(ds, nullptr);
  2526. // Create an iterator over the result of the above dataset
  2527. // This will trigger the creation of the Execution Tree and launch it.
  2528. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2529. EXPECT_NE(iter, nullptr);
  2530. // Iterate the dataset and get each row
  2531. std::unordered_map<std::string, mindspore::MSTensor> row;
  2532. ASSERT_OK(iter->GetNextRow(&row));
  2533. std::vector<std::vector<std::string>> expected = {
  2534. {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};
  2535. uint64_t i = 0;
  2536. while (row.size() != 0) {
  2537. auto ind = row["text"];
  2538. std::shared_ptr<Tensor> de_expected_tensor;
  2539. int x = expected[i].size();
  2540. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
  2541. mindspore::MSTensor expected_tensor =
  2542. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2543. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  2544. ASSERT_OK(iter->GetNextRow(&row));
  2545. i++;
  2546. }
  2547. EXPECT_EQ(i, 4);
  2548. // Manually terminate the pipeline
  2549. iter->Stop();
  2550. }
  2551. TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess1) {
  2552. // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is true and the with_offsets is
  2553. // false.
  2554. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess1.";
  2555. // Create a TextFile dataset
  2556. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  2557. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2558. EXPECT_NE(ds, nullptr);
  2559. // Create unicodescript_tokenizer operation on ds
  2560. std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>(true);
  2561. EXPECT_NE(unicodescript_tokenizer, nullptr);
  2562. // Create Map operation on ds
  2563. ds = ds->Map({unicodescript_tokenizer}, {"text"});
  2564. EXPECT_NE(ds, nullptr);
  2565. // Create an iterator over the result of the above dataset
  2566. // This will trigger the creation of the Execution Tree and launch it.
  2567. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2568. EXPECT_NE(iter, nullptr);
  2569. // Iterate the dataset and get each row
  2570. std::unordered_map<std::string, mindspore::MSTensor> row;
  2571. ASSERT_OK(iter->GetNextRow(&row));
  2572. std::vector<std::vector<std::string>> expected = {
  2573. {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {" "}};
  2574. uint64_t i = 0;
  2575. while (row.size() != 0) {
  2576. auto ind = row["text"];
  2577. std::shared_ptr<Tensor> de_expected_tensor;
  2578. int x = expected[i].size();
  2579. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
  2580. mindspore::MSTensor expected_tensor =
  2581. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2582. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  2583. ASSERT_OK(iter->GetNextRow(&row));
  2584. i++;
  2585. }
  2586. EXPECT_EQ(i, 4);
  2587. // Manually terminate the pipeline
  2588. iter->Stop();
  2589. }
  2590. TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess2) {
  2591. // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is false and the with_offsets is
  2592. // true.
  2593. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess2.";
  2594. // Create a TextFile dataset
  2595. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  2596. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2597. EXPECT_NE(ds, nullptr);
  2598. // Create unicodescript_tokenizer operation on ds
  2599. std::shared_ptr<TensorTransform> unicodescript_tokenizer =
  2600. std::make_shared<text::UnicodeScriptTokenizer>(false, true);
  2601. EXPECT_NE(unicodescript_tokenizer, nullptr);
  2602. // Create Map operation on ds
  2603. ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
  2604. {"token", "offsets_start", "offsets_limit"});
  2605. EXPECT_NE(ds, nullptr);
  2606. // Create an iterator over the result of the above dataset
  2607. // This will trigger the creation of the Execution Tree and launch it.
  2608. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2609. EXPECT_NE(iter, nullptr);
  2610. // Iterate the dataset and get each row
  2611. std::unordered_map<std::string, mindspore::MSTensor> row;
  2612. ASSERT_OK(iter->GetNextRow(&row));
  2613. std::vector<std::vector<std::string>> expected_tokens = {
  2614. {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};
  2615. std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
  2616. std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 18, 19}, {15, 18}, {9, 16, 17}, {0}};
  2617. uint64_t i = 0;
  2618. while (row.size() != 0) {
  2619. auto token = row["token"];
  2620. auto start = row["offsets_start"];
  2621. auto limit = row["offsets_limit"];
  2622. std::shared_ptr<Tensor> de_expected_tokens;
  2623. int x = expected_tokens[i].size();
  2624. ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
  2625. mindspore::MSTensor ms_expected_tokens =
  2626. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  2627. EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
  2628. std::shared_ptr<Tensor> de_expected_offsets_start;
  2629. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
  2630. mindspore::MSTensor ms_expected_offsets_start =
  2631. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  2632. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  2633. std::shared_ptr<Tensor> de_expected_offsets_limit;
  2634. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
  2635. mindspore::MSTensor ms_expected_offsets_limit =
  2636. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  2637. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  2638. ASSERT_OK(iter->GetNextRow(&row));
  2639. i++;
  2640. }
  2641. EXPECT_EQ(i, 4);
  2642. // Manually terminate the pipeline
  2643. iter->Stop();
  2644. }
  2645. TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess3) {
  2646. // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is true and the with_offsets is
  2647. // true.
  2648. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess3.";
  2649. // Create a TextFile dataset
  2650. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  2651. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2652. EXPECT_NE(ds, nullptr);
  2653. // Create unicodescript_tokenizer operation on ds
  2654. std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>(true, true);
  2655. EXPECT_NE(unicodescript_tokenizer, nullptr);
  2656. // Create Map operation on ds
  2657. ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
  2658. {"token", "offsets_start", "offsets_limit"});
  2659. EXPECT_NE(ds, nullptr);
  2660. // Create an iterator over the result of the above dataset
  2661. // This will trigger the creation of the Execution Tree and launch it.
  2662. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2663. EXPECT_NE(iter, nullptr);
  2664. // Iterate the dataset and get each row
  2665. std::unordered_map<std::string, mindspore::MSTensor> row;
  2666. ASSERT_OK(iter->GetNextRow(&row));
  2667. std::vector<std::vector<std::string>> expected_tokens = {
  2668. {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {" "}};
  2669. std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 7, 8, 10, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
  2670. std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 8, 10, 11, 18, 19}, {15, 18}, {9, 16, 17}, {2}};
  2671. uint64_t i = 0;
  2672. while (row.size() != 0) {
  2673. auto token = row["token"];
  2674. auto start = row["offsets_start"];
  2675. auto limit = row["offsets_limit"];
  2676. std::shared_ptr<Tensor> de_expected_tokens;
  2677. int x = expected_tokens[i].size();
  2678. ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
  2679. mindspore::MSTensor ms_expected_tokens =
  2680. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  2681. EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
  2682. std::shared_ptr<Tensor> de_expected_offsets_start;
  2683. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
  2684. mindspore::MSTensor ms_expected_offsets_start =
  2685. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  2686. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  2687. std::shared_ptr<Tensor> de_expected_offsets_limit;
  2688. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
  2689. mindspore::MSTensor ms_expected_offsets_limit =
  2690. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  2691. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  2692. ASSERT_OK(iter->GetNextRow(&row));
  2693. i++;
  2694. }
  2695. EXPECT_EQ(i, 4);
  2696. // Manually terminate the pipeline
  2697. iter->Stop();
  2698. }
  2699. TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess) {
  2700. // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is default.
  2701. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess.";
  2702. // Create a TextFile dataset
  2703. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  2704. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2705. EXPECT_NE(ds, nullptr);
  2706. // Create white_tokenizer operation on ds
  2707. std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  2708. EXPECT_NE(white_tokenizer, nullptr);
  2709. // Create Map operation on ds
  2710. ds = ds->Map({white_tokenizer}, {"text"});
  2711. EXPECT_NE(ds, nullptr);
  2712. // Create an iterator over the result of the above dataset
  2713. // This will trigger the creation of the Execution Tree and launch it.
  2714. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2715. EXPECT_NE(iter, nullptr);
  2716. // Iterate the dataset and get each row
  2717. std::unordered_map<std::string, mindspore::MSTensor> row;
  2718. ASSERT_OK(iter->GetNextRow(&row));
  2719. std::vector<std::vector<std::string>> expected = {
  2720. {"This", "is", "a", "text", "file."}, {"Be", "happy", "every", "day."}, {"Good", "luck", "to", "everyone."}};
  2721. uint64_t i = 0;
  2722. while (row.size() != 0) {
  2723. auto ind = row["text"];
  2724. std::shared_ptr<Tensor> de_expected_tensor;
  2725. int x = expected[i].size();
  2726. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
  2727. mindspore::MSTensor expected_tensor =
  2728. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2729. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  2730. ASSERT_OK(iter->GetNextRow(&row));
  2731. i++;
  2732. }
  2733. EXPECT_EQ(i, 3);
  2734. // Manually terminate the pipeline
  2735. iter->Stop();
  2736. }
  2737. TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess1) {
  2738. // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is true.
  2739. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess1.";
  2740. // Create a TextFile dataset
  2741. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  2742. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2743. EXPECT_NE(ds, nullptr);
  2744. // Create white_tokenizer operation on ds
  2745. std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>(true);
  2746. EXPECT_NE(white_tokenizer, nullptr);
  2747. // Create Map operation on ds
  2748. ds = ds->Map({white_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
  2749. {"token", "offsets_start", "offsets_limit"});
  2750. EXPECT_NE(ds, nullptr);
  2751. // Create an iterator over the result of the above dataset
  2752. // This will trigger the creation of the Execution Tree and launch it.
  2753. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2754. EXPECT_NE(iter, nullptr);
  2755. // Iterate the dataset and get each row
  2756. std::unordered_map<std::string, mindspore::MSTensor> row;
  2757. ASSERT_OK(iter->GetNextRow(&row));
  2758. std::vector<std::vector<std::string>> expected_tokens = {
  2759. {"Welcome", "to", "Beijing!"}, {"北京欢迎您!"}, {"我喜欢English!"}, {""}};
  2760. std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11}, {0}, {0}, {0}};
  2761. std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 19}, {18}, {17}, {0}};
  2762. uint64_t i = 0;
  2763. while (row.size() != 0) {
  2764. auto token = row["token"];
  2765. auto start = row["offsets_start"];
  2766. auto limit = row["offsets_limit"];
  2767. std::shared_ptr<Tensor> de_expected_tokens;
  2768. int x = expected_tokens[i].size();
  2769. ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
  2770. mindspore::MSTensor ms_expected_tokens =
  2771. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  2772. EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
  2773. std::shared_ptr<Tensor> de_expected_offsets_start;
  2774. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
  2775. mindspore::MSTensor ms_expected_offsets_start =
  2776. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  2777. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  2778. std::shared_ptr<Tensor> de_expected_offsets_limit;
  2779. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
  2780. mindspore::MSTensor ms_expected_offsets_limit =
  2781. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  2782. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  2783. ASSERT_OK(iter->GetNextRow(&row));
  2784. i++;
  2785. }
  2786. EXPECT_EQ(i, 4);
  2787. // Manually terminate the pipeline
  2788. iter->Stop();
  2789. }
  2790. /// Feature: Vectors
  2791. /// Description: test with default parameter in function BuildFromFile and function Lookup
  2792. /// Expectation: return correct MSTensor which is equal to the expected
  2793. TEST_F(MindDataTestPipeline, TestVectorsDefaultParam) {
  2794. // Test with default parameter.
  2795. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDefaultParam.";
  2796. // Create a TextFile dataset
  2797. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  2798. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2799. EXPECT_NE(ds, nullptr);
  2800. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
  2801. std::shared_ptr<Vectors> vectors;
  2802. Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
  2803. EXPECT_EQ(s, Status::OK());
  2804. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors);
  2805. EXPECT_NE(lookup, nullptr);
  2806. // Create Map operation on ds
  2807. ds = ds->Map({lookup}, {"text"});
  2808. EXPECT_NE(ds, nullptr);
  2809. // Create an iterator over the result of the above dataset
  2810. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2811. EXPECT_NE(iter, nullptr);
  2812. // Iterate the dataset and get each row
  2813. std::unordered_map<std::string, mindspore::MSTensor> row;
  2814. ASSERT_OK(iter->GetNextRow(&row));
  2815. uint64_t i = 0;
  2816. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  2817. {0, 0, 0, 0, 0, 0},
  2818. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  2819. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  2820. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  2821. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  2822. {0, 0, 0, 0, 0, 0}};
  2823. while (row.size() != 0) {
  2824. auto ind = row["text"];
  2825. MS_LOG(INFO) << ind.Shape();
  2826. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  2827. TensorPtr de_expected_item;
  2828. dsize_t dim = 6;
  2829. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  2830. mindspore::MSTensor ms_expected_item =
  2831. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  2832. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  2833. ASSERT_OK(iter->GetNextRow(&row));
  2834. i++;
  2835. }
  2836. EXPECT_EQ(i, 7);
  2837. // Manually terminate the pipeline
  2838. iter->Stop();
  2839. }
  2840. /// Feature: Vectors
  2841. /// Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
  2842. /// Expectation: return correct MSTensor which is equal to the expected
  2843. TEST_F(MindDataTestPipeline, TestVectorsAllBuildfromfileParams) {
  2844. // Test with two parameters.
  2845. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllBuildfromfileParams.";
  2846. // Create a TextFile dataset
  2847. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  2848. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2849. EXPECT_NE(ds, nullptr);
  2850. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
  2851. std::shared_ptr<Vectors> vectors;
  2852. Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
  2853. EXPECT_EQ(s, Status::OK());
  2854. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors);
  2855. EXPECT_NE(lookup, nullptr);
  2856. // Create Map operation on ds
  2857. ds = ds->Map({lookup}, {"text"});
  2858. EXPECT_NE(ds, nullptr);
  2859. // Create an iterator over the result of the above dataset
  2860. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2861. EXPECT_NE(iter, nullptr);
  2862. // Iterate the dataset and get each row
  2863. std::unordered_map<std::string, mindspore::MSTensor> row;
  2864. ASSERT_OK(iter->GetNextRow(&row));
  2865. uint64_t i = 0;
  2866. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  2867. {0, 0, 0, 0, 0, 0},
  2868. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  2869. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  2870. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  2871. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  2872. {0, 0, 0, 0, 0, 0}};
  2873. while (row.size() != 0) {
  2874. auto ind = row["text"];
  2875. MS_LOG(INFO) << ind.Shape();
  2876. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  2877. TensorPtr de_expected_item;
  2878. dsize_t dim = 6;
  2879. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  2880. mindspore::MSTensor ms_expected_item =
  2881. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  2882. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  2883. ASSERT_OK(iter->GetNextRow(&row));
  2884. i++;
  2885. }
  2886. EXPECT_EQ(i, 7);
  2887. // Manually terminate the pipeline
  2888. iter->Stop();
  2889. }
  2890. /// Feature: Vectors
  2891. /// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
  2892. /// Expectation: return correct MSTensor which is equal to the expected
  2893. TEST_F(MindDataTestPipeline, TestVectorsUnknownInit) {
  2894. // Test with two parameters.
  2895. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsUnknownInit.";
  2896. // Create a TextFile dataset
  2897. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  2898. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2899. EXPECT_NE(ds, nullptr);
  2900. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
  2901. std::shared_ptr<Vectors> vectors;
  2902. Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
  2903. EXPECT_EQ(s, Status::OK());
  2904. std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  2905. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init);
  2906. EXPECT_NE(lookup, nullptr);
  2907. // Create Map operation on ds
  2908. ds = ds->Map({lookup}, {"text"});
  2909. EXPECT_NE(ds, nullptr);
  2910. // Create an iterator over the result of the above dataset
  2911. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2912. EXPECT_NE(iter, nullptr);
  2913. // Iterate the dataset and get each row
  2914. std::unordered_map<std::string, mindspore::MSTensor> row;
  2915. ASSERT_OK(iter->GetNextRow(&row));
  2916. uint64_t i = 0;
  2917. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  2918. {-1, -1, -1, -1, -1, -1},
  2919. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  2920. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  2921. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  2922. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  2923. {-1, -1, -1, -1, -1, -1}};
  2924. while (row.size() != 0) {
  2925. auto ind = row["text"];
  2926. MS_LOG(INFO) << ind.Shape();
  2927. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  2928. TensorPtr de_expected_item;
  2929. dsize_t dim = 6;
  2930. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  2931. mindspore::MSTensor ms_expected_item =
  2932. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  2933. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  2934. ASSERT_OK(iter->GetNextRow(&row));
  2935. i++;
  2936. }
  2937. EXPECT_EQ(i, 7);
  2938. // Manually terminate the pipeline
  2939. iter->Stop();
  2940. }
  2941. /// Feature: Vectors
  2942. /// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
  2943. /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
  2944. /// Expectation: return correct MSTensor which is equal to the expected
  2945. TEST_F(MindDataTestPipeline, TestVectorsAllParams) {
  2946. // Test with all parameters.
  2947. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllParams.";
  2948. // Create a TextFile dataset
  2949. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  2950. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2951. EXPECT_NE(ds, nullptr);
  2952. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
  2953. std::shared_ptr<Vectors> vectors;
  2954. Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
  2955. EXPECT_EQ(s, Status::OK());
  2956. std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  2957. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
  2958. EXPECT_NE(lookup, nullptr);
  2959. // Create Map operation on ds
  2960. ds = ds->Map({lookup}, {"text"});
  2961. EXPECT_NE(ds, nullptr);
  2962. // Create an iterator over the result of the above dataset
  2963. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2964. EXPECT_NE(iter, nullptr);
  2965. // Iterate the dataset and get each row
  2966. std::unordered_map<std::string, mindspore::MSTensor> row;
  2967. ASSERT_OK(iter->GetNextRow(&row));
  2968. uint64_t i = 0;
  2969. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  2970. {-1, -1, -1, -1, -1, -1},
  2971. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  2972. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  2973. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  2974. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  2975. {-1, -1, -1, -1, -1, -1}};
  2976. while (row.size() != 0) {
  2977. auto ind = row["text"];
  2978. MS_LOG(INFO) << ind.Shape();
  2979. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  2980. TensorPtr de_expected_item;
  2981. dsize_t dim = 6;
  2982. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  2983. mindspore::MSTensor ms_expected_item =
  2984. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  2985. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  2986. ASSERT_OK(iter->GetNextRow(&row));
  2987. i++;
  2988. }
  2989. EXPECT_EQ(i, 7);
  2990. // Manually terminate the pipeline
  2991. iter->Stop();
  2992. }
  2993. /// Feature: Vectors
  2994. /// Description: test with pre-vectors set that have the different dimension
  2995. /// Expectation: throw correct error and message
  2996. TEST_F(MindDataTestPipeline, TestVectorsDifferentDimension) {
  2997. // Tokens don't have the same number of vectors.
  2998. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDifferentDimension.";
  2999. // Create a TextFile dataset
  3000. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3001. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3002. EXPECT_NE(ds, nullptr);
  3003. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_dim_different.txt";
  3004. std::shared_ptr<Vectors> vectors;
  3005. Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
  3006. EXPECT_NE(s, Status::OK());
  3007. }
  3008. /// Feature: Vectors
  3009. /// Description: test with pre-vectors set that has the head-info
  3010. /// Expectation: return correct MSTensor which is equal to the expected
  3011. TEST_F(MindDataTestPipeline, TestVectorsWithHeadInfo) {
  3012. // Test with words that has head info.
  3013. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithHeadInfo.";
  3014. // Create a TextFile dataset
  3015. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3016. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3017. EXPECT_NE(ds, nullptr);
  3018. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_with_info.txt";
  3019. std::shared_ptr<Vectors> vectors;
  3020. Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
  3021. EXPECT_EQ(s, Status::OK());
  3022. std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  3023. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
  3024. EXPECT_NE(lookup, nullptr);
  3025. // Create Map operation on ds
  3026. ds = ds->Map({lookup}, {"text"});
  3027. EXPECT_NE(ds, nullptr);
  3028. // Create an iterator over the result of the above dataset
  3029. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3030. EXPECT_NE(iter, nullptr);
  3031. // Iterate the dataset and get each row
  3032. std::unordered_map<std::string, mindspore::MSTensor> row;
  3033. ASSERT_OK(iter->GetNextRow(&row));
  3034. uint64_t i = 0;
  3035. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3036. {-1, -1, -1, -1, -1, -1},
  3037. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3038. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3039. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3040. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3041. {-1, -1, -1, -1, -1, -1}};
  3042. while (row.size() != 0) {
  3043. auto ind = row["text"];
  3044. MS_LOG(INFO) << ind.Shape();
  3045. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3046. TensorPtr de_expected_item;
  3047. dsize_t dim = 6;
  3048. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3049. mindspore::MSTensor ms_expected_item =
  3050. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3051. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3052. ASSERT_OK(iter->GetNextRow(&row));
  3053. i++;
  3054. }
  3055. EXPECT_EQ(i, 7);
  3056. // Manually terminate the pipeline
  3057. iter->Stop();
  3058. }
  3059. /// Feature: Vectors
  3060. /// Description: test with the parameter max_vectors that is <= 0
  3061. /// Expectation: throw correct error and message
  3062. TEST_F(MindDataTestPipeline, TestVectorsMaxVectorsLessThanZero) {
  3063. // Test with max_vectors <= 0.
  3064. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsMaxVectorsLessThanZero.";
  3065. // Create a TextFile dataset
  3066. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3067. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3068. EXPECT_NE(ds, nullptr);
  3069. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
  3070. std::shared_ptr<Vectors> vectors;
  3071. Status s = Vectors::BuildFromFile(&vectors, vectors_dir, -1);
  3072. EXPECT_NE(s, Status::OK());
  3073. }
  3074. /// Feature: Vectors
  3075. /// Description: test with the pre-vectors file that is empty
  3076. /// Expectation: throw correct error and message
  3077. TEST_F(MindDataTestPipeline, TestVectorsWithEmptyFile) {
  3078. // Read empty file.
  3079. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithEmptyFile.";
  3080. // Create a TextFile dataset
  3081. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3082. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3083. EXPECT_NE(ds, nullptr);
  3084. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_empty.txt";
  3085. std::shared_ptr<Vectors> vectors;
  3086. Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
  3087. EXPECT_NE(s, Status::OK());
  3088. }
  3089. /// Feature: Vectors
  3090. /// Description: test with the pre-vectors file that is not exist
  3091. /// Expectation: throw correct error and message
  3092. TEST_F(MindDataTestPipeline, TestVectorsWithNotExistFile) {
  3093. // Test with not exist file.
  3094. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithNotExistFile.";
  3095. // Create a TextFile dataset
  3096. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3097. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3098. EXPECT_NE(ds, nullptr);
  3099. std::string vectors_dir = datasets_root_path_ + "/testVectors/no_vectors.txt";
  3100. std::shared_ptr<Vectors> vectors;
  3101. Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
  3102. EXPECT_NE(s, Status::OK());
  3103. }
  3104. /// Feature: Vectors
  3105. /// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set
  3106. /// Expectation: throw correct error and message
  3107. TEST_F(MindDataTestPipeline, TestVectorsWithWrongInfoFile) {
  3108. // Wrong info.
  3109. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithWrongInfoFile.";
  3110. // Create a TextFile dataset
  3111. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3112. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3113. EXPECT_NE(ds, nullptr);
  3114. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_with_wrong_info.txt";
  3115. std::shared_ptr<Vectors> vectors;
  3116. Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
  3117. EXPECT_NE(s, Status::OK());
  3118. }
  3119. /// Feature: FastText
  3120. /// Description: test with default parameter in function BuildFromFile and function Lookup
  3121. /// Expectation: return correct MSTensor which is equal to the expected
  3122. TEST_F(MindDataTestPipeline, TestFastTextDefaultParam) {
  3123. // Test with default parameter.
  3124. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDefaultParam.";
  3125. // Create a TextFile dataset
  3126. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3127. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3128. EXPECT_NE(ds, nullptr);
  3129. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  3130. std::shared_ptr<FastText> fast_text;
  3131. Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  3132. EXPECT_EQ(s, Status::OK());
  3133. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text);
  3134. EXPECT_NE(lookup, nullptr);
  3135. // Create Map operation on ds
  3136. ds = ds->Map({lookup}, {"text"});
  3137. EXPECT_NE(ds, nullptr);
  3138. // Create an iterator over the result of the above dataset
  3139. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3140. EXPECT_NE(iter, nullptr);
  3141. // Iterate the dataset and get each row
  3142. std::unordered_map<std::string, mindspore::MSTensor> row;
  3143. ASSERT_OK(iter->GetNextRow(&row));
  3144. uint64_t i = 0;
  3145. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3146. {0, 0, 0, 0, 0, 0},
  3147. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3148. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3149. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3150. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3151. {0, 0, 0, 0, 0, 0}};
  3152. while (row.size() != 0) {
  3153. auto ind = row["text"];
  3154. MS_LOG(INFO) << ind.Shape();
  3155. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3156. TensorPtr de_expected_item;
  3157. dsize_t dim = 6;
  3158. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3159. mindspore::MSTensor ms_expected_item =
  3160. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3161. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3162. ASSERT_OK(iter->GetNextRow(&row));
  3163. i++;
  3164. }
  3165. EXPECT_EQ(i, 7);
  3166. // Manually terminate the pipeline
  3167. iter->Stop();
  3168. }
  3169. /// Feature: FastText
  3170. /// Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
  3171. /// Expectation: return correct MSTensor which is equal to the expected
  3172. TEST_F(MindDataTestPipeline, TestFastTextAllBuildfromfileParams) {
  3173. // Test with two parameters.
  3174. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllBuildfromfileParams.";
  3175. // Create a TextFile dataset
  3176. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3177. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3178. EXPECT_NE(ds, nullptr);
  3179. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  3180. std::shared_ptr<FastText> fast_text;
  3181. Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
  3182. EXPECT_EQ(s, Status::OK());
  3183. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text);
  3184. EXPECT_NE(lookup, nullptr);
  3185. // Create Map operation on ds
  3186. ds = ds->Map({lookup}, {"text"});
  3187. EXPECT_NE(ds, nullptr);
  3188. // Create an iterator over the result of the above dataset
  3189. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3190. EXPECT_NE(iter, nullptr);
  3191. // Iterate the dataset and get each row
  3192. std::unordered_map<std::string, mindspore::MSTensor> row;
  3193. ASSERT_OK(iter->GetNextRow(&row));
  3194. uint64_t i = 0;
  3195. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3196. {0, 0, 0, 0, 0, 0},
  3197. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3198. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3199. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3200. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3201. {0, 0, 0, 0, 0, 0}};
  3202. while (row.size() != 0) {
  3203. auto ind = row["text"];
  3204. MS_LOG(INFO) << ind.Shape();
  3205. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3206. TensorPtr de_expected_item;
  3207. dsize_t dim = 6;
  3208. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3209. mindspore::MSTensor ms_expected_item =
  3210. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3211. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3212. ASSERT_OK(iter->GetNextRow(&row));
  3213. i++;
  3214. }
  3215. EXPECT_EQ(i, 7);
  3216. // Manually terminate the pipeline
  3217. iter->Stop();
  3218. }
  3219. /// Feature: FastText
  3220. /// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
  3221. /// Expectation: return correct MSTensor which is equal to the expected
  3222. TEST_F(MindDataTestPipeline, TestFastTextUnknownInit) {
  3223. // Test with two parameters.
  3224. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextUnknownInit.";
  3225. // Create a TextFile dataset
  3226. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3227. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3228. EXPECT_NE(ds, nullptr);
  3229. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  3230. std::shared_ptr<FastText> fast_text;
  3231. Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
  3232. EXPECT_EQ(s, Status::OK());
  3233. std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  3234. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text, unknown_init);
  3235. EXPECT_NE(lookup, nullptr);
  3236. // Create Map operation on ds
  3237. ds = ds->Map({lookup}, {"text"});
  3238. EXPECT_NE(ds, nullptr);
  3239. // Create an iterator over the result of the above dataset
  3240. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3241. EXPECT_NE(iter, nullptr);
  3242. // Iterate the dataset and get each row
  3243. std::unordered_map<std::string, mindspore::MSTensor> row;
  3244. ASSERT_OK(iter->GetNextRow(&row));
  3245. uint64_t i = 0;
  3246. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3247. {-1, -1, -1, -1, -1, -1},
  3248. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3249. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3250. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3251. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3252. {-1, -1, -1, -1, -1, -1}};
  3253. while (row.size() != 0) {
  3254. auto ind = row["text"];
  3255. MS_LOG(INFO) << ind.Shape();
  3256. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3257. TensorPtr de_expected_item;
  3258. dsize_t dim = 6;
  3259. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3260. mindspore::MSTensor ms_expected_item =
  3261. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3262. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3263. ASSERT_OK(iter->GetNextRow(&row));
  3264. i++;
  3265. }
  3266. EXPECT_EQ(i, 7);
  3267. // Manually terminate the pipeline
  3268. iter->Stop();
  3269. }
  3270. /// Feature: FastText
  3271. /// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
  3272. /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
  3273. /// Expectation: return correct MSTensor which is equal to the expected
  3274. TEST_F(MindDataTestPipeline, TestFastTextAllParams) {
  3275. // Test with all parameters.
  3276. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllParams.";
  3277. // Create a TextFile dataset
  3278. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3279. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3280. EXPECT_NE(ds, nullptr);
  3281. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  3282. std::shared_ptr<FastText> fast_text;
  3283. Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  3284. EXPECT_EQ(s, Status::OK());
  3285. std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  3286. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text, unknown_init, true);
  3287. EXPECT_NE(lookup, nullptr);
  3288. // Create Map operation on ds
  3289. ds = ds->Map({lookup}, {"text"});
  3290. EXPECT_NE(ds, nullptr);
  3291. // Create an iterator over the result of the above dataset
  3292. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3293. EXPECT_NE(iter, nullptr);
  3294. // Iterate the dataset and get each row
  3295. std::unordered_map<std::string, mindspore::MSTensor> row;
  3296. ASSERT_OK(iter->GetNextRow(&row));
  3297. uint64_t i = 0;
  3298. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3299. {-1, -1, -1, -1, -1, -1},
  3300. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3301. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3302. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3303. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3304. {-1, -1, -1, -1, -1, -1}};
  3305. while (row.size() != 0) {
  3306. auto ind = row["text"];
  3307. MS_LOG(INFO) << ind.Shape();
  3308. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3309. TensorPtr de_expected_item;
  3310. dsize_t dim = 6;
  3311. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3312. mindspore::MSTensor ms_expected_item =
  3313. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3314. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3315. ASSERT_OK(iter->GetNextRow(&row));
  3316. i++;
  3317. }
  3318. EXPECT_EQ(i, 7);
  3319. // Manually terminate the pipeline
  3320. iter->Stop();
  3321. }
  3322. /// Feature: FastText
  3323. /// Description: test with pre-vectors set that have the different dimension
  3324. /// Expectation: throw correct error and message
  3325. TEST_F(MindDataTestPipeline, TestFastTextDifferentDimension) {
  3326. // Tokens don't have the same number of vectors.
  3327. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDifferentDimension.";
  3328. // Create a TextFile dataset
  3329. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3330. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3331. EXPECT_NE(ds, nullptr);
  3332. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_dim_different.vec";
  3333. std::shared_ptr<FastText> fast_text;
  3334. Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
  3335. EXPECT_NE(s, Status::OK());
  3336. }
  3337. /// Feature: FastText
  3338. /// Description: test with the parameter max_vectors that is <= 0
  3339. /// Expectation: throw correct error and message
  3340. TEST_F(MindDataTestPipeline, TestFastTextMaxVectorsLessThanZero) {
  3341. // Test with max_vectors <= 0.
  3342. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextMaxVectorsLessThanZero.";
  3343. // Create a TextFile dataset
  3344. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3345. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3346. EXPECT_NE(ds, nullptr);
  3347. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  3348. std::shared_ptr<FastText> fast_text;
  3349. Status s = FastText::BuildFromFile(&fast_text, vectors_dir, -1);
  3350. EXPECT_NE(s, Status::OK());
  3351. }
  3352. /// Feature: FastText
  3353. /// Description: test with the pre-vectors file that is empty
  3354. /// Expectation: throw correct error and message
  3355. TEST_F(MindDataTestPipeline, TestFastTextWithEmptyFile) {
  3356. // Read empty file.
  3357. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithEmptyFile.";
  3358. // Create a TextFile dataset
  3359. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3360. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3361. EXPECT_NE(ds, nullptr);
  3362. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_empty.vec";
  3363. std::shared_ptr<FastText> fast_text;
  3364. Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  3365. EXPECT_NE(s, Status::OK());
  3366. }
  3367. /// Feature: FastText
  3368. /// Description: test with the pre-vectors file that is not exist
  3369. /// Expectation: throw correct error and message
  3370. TEST_F(MindDataTestPipeline, TestFastTextWithNotExistFile) {
  3371. // Test with not exist file.
  3372. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithNotExistFile.";
  3373. // Create a TextFile dataset
  3374. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3375. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3376. EXPECT_NE(ds, nullptr);
  3377. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/no_fasttext.vec";
  3378. std::shared_ptr<FastText> fast_text;
  3379. Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  3380. EXPECT_NE(s, Status::OK());
  3381. }
  3382. /// Feature: FastText
  3383. /// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set
  3384. /// Expectation: throw correct error and message
  3385. TEST_F(MindDataTestPipeline, TestFastTextWithWrongInfoFile) {
  3386. // Wrong info.
  3387. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongInfoFile.";
  3388. // Create a TextFile dataset
  3389. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3390. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3391. EXPECT_NE(ds, nullptr);
  3392. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_with_wrong_info.vec";
  3393. std::shared_ptr<FastText> fast_text;
  3394. Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  3395. EXPECT_NE(s, Status::OK());
  3396. }
  3397. /// Feature: FastText
  3398. /// Description: test with the pre-vectors set that has a wrong suffix
  3399. /// Expectation: throw correct error and message
  3400. TEST_F(MindDataTestPipeline, TestFastTextWithWrongSuffix) {
  3401. // Wrong info.
  3402. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongSuffix.";
  3403. // Create a TextFile dataset
  3404. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3405. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3406. EXPECT_NE(ds, nullptr);
  3407. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.txt";
  3408. std::shared_ptr<FastText> fast_text;
  3409. Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  3410. EXPECT_NE(s, Status::OK());
  3411. }
  3412. /// Feature: GloVe
  3413. /// Description: test with default parameter in function BuildFromFile and function Lookup
  3414. /// Expectation: return correct MSTensor which is equal to the expected
  3415. TEST_F(MindDataTestPipeline, TestGloVeDefaultParam) {
  3416. // Test with default parameter.
  3417. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeDefaultParam.";
  3418. // Create a TextFile dataset
  3419. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3420. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3421. EXPECT_NE(ds, nullptr);
  3422. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  3423. std::shared_ptr<GloVe> glove;
  3424. Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  3425. EXPECT_EQ(s, Status::OK());
  3426. std::shared_ptr<TensorTransform> lookup =
  3427. std::make_shared<text::ToVectors>(glove);
  3428. EXPECT_NE(lookup, nullptr);
  3429. // Create Map operation on ds
  3430. ds = ds->Map({lookup}, {"text"});
  3431. EXPECT_NE(ds, nullptr);
  3432. // Create an iterator over the result of the above dataset
  3433. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3434. EXPECT_NE(iter, nullptr);
  3435. // Iterate the dataset and get each row
  3436. std::unordered_map<std::string, mindspore::MSTensor> row;
  3437. ASSERT_OK(iter->GetNextRow(&row));
  3438. uint64_t i = 0;
  3439. std::vector<std::vector<float>> expected = {
  3440. {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3441. {0, 0, 0, 0, 0, 0},
  3442. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3443. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3444. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3445. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3446. {0, 0, 0, 0, 0, 0}};
  3447. while (row.size() != 0) {
  3448. auto ind = row["text"];
  3449. MS_LOG(INFO) << ind.Shape();
  3450. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3451. TensorPtr de_expected_item;
  3452. dsize_t dim = 6;
  3453. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3454. mindspore::MSTensor ms_expected_item =
  3455. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3456. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3457. ASSERT_OK(iter->GetNextRow(&row));
  3458. i++;
  3459. }
  3460. EXPECT_EQ(i, 7);
  3461. // Manually terminate the pipeline
  3462. iter->Stop();
  3463. }
  3464. /// Feature: GloVe
  3465. /// Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
  3466. /// Expectation: return correct MSTensor which is equal to the expected
  3467. TEST_F(MindDataTestPipeline, TestGloVeAllBuildfromfileParams) {
  3468. // Test with two parameters.
  3469. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeAllBuildfromfileParams.";
  3470. // Create a TextFile dataset
  3471. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3472. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3473. EXPECT_NE(ds, nullptr);
  3474. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  3475. std::shared_ptr<GloVe> glove;
  3476. Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100);
  3477. EXPECT_EQ(s, Status::OK());
  3478. std::shared_ptr<TensorTransform> lookup =
  3479. std::make_shared<text::ToVectors>(glove);
  3480. EXPECT_NE(lookup, nullptr);
  3481. // Create Map operation on ds
  3482. ds = ds->Map({lookup}, {"text"});
  3483. EXPECT_NE(ds, nullptr);
  3484. // Create an iterator over the result of the above dataset
  3485. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3486. EXPECT_NE(iter, nullptr);
  3487. // Iterate the dataset and get each row
  3488. std::unordered_map<std::string, mindspore::MSTensor> row;
  3489. ASSERT_OK(iter->GetNextRow(&row));
  3490. uint64_t i = 0;
  3491. std::vector<std::vector<float>> expected = {
  3492. {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3493. {0, 0, 0, 0, 0, 0},
  3494. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3495. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3496. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3497. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3498. {0, 0, 0, 0, 0, 0}};
  3499. while (row.size() != 0) {
  3500. auto ind = row["text"];
  3501. MS_LOG(INFO) << ind.Shape();
  3502. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3503. TensorPtr de_expected_item;
  3504. dsize_t dim = 6;
  3505. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3506. mindspore::MSTensor ms_expected_item =
  3507. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3508. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3509. ASSERT_OK(iter->GetNextRow(&row));
  3510. i++;
  3511. }
  3512. EXPECT_EQ(i, 7);
  3513. // Manually terminate the pipeline
  3514. iter->Stop();
  3515. }
  3516. /// Feature: GloVe
  3517. /// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
  3518. /// Expectation: return correct MSTensor which is equal to the expected
  3519. TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) {
  3520. // Test with two parameters.
  3521. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeUnknownInit.";
  3522. // Create a TextFile dataset
  3523. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3524. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3525. EXPECT_NE(ds, nullptr);
  3526. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  3527. std::shared_ptr<GloVe> glove;
  3528. Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100);
  3529. EXPECT_EQ(s, Status::OK());
  3530. std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  3531. std::shared_ptr<TensorTransform> lookup =
  3532. std::make_shared<text::ToVectors>(glove, unknown_init);
  3533. EXPECT_NE(lookup, nullptr);
  3534. // Create Map operation on ds
  3535. ds = ds->Map({lookup}, {"text"});
  3536. EXPECT_NE(ds, nullptr);
  3537. // Create an iterator over the result of the above dataset
  3538. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3539. EXPECT_NE(iter, nullptr);
  3540. // Iterate the dataset and get each row
  3541. std::unordered_map<std::string, mindspore::MSTensor> row;
  3542. ASSERT_OK(iter->GetNextRow(&row));
  3543. uint64_t i = 0;
  3544. std::vector<std::vector<float>> expected = {
  3545. {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3546. {-1, -1, -1, -1, -1, -1},
  3547. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3548. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3549. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3550. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3551. {-1, -1, -1, -1, -1, -1}};
  3552. while (row.size() != 0) {
  3553. auto ind = row["text"];
  3554. MS_LOG(INFO) << ind.Shape();
  3555. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3556. TensorPtr de_expected_item;
  3557. dsize_t dim = 6;
  3558. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3559. mindspore::MSTensor ms_expected_item =
  3560. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3561. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3562. ASSERT_OK(iter->GetNextRow(&row));
  3563. i++;
  3564. }
  3565. EXPECT_EQ(i, 7);
  3566. // Manually terminate the pipeline
  3567. iter->Stop();
  3568. }
  3569. /// Feature: GloVe
  3570. /// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
  3571. /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
  3572. /// Expectation: return correct MSTensor which is equal to the expected
  3573. TEST_F(MindDataTestPipeline, TestGloVeAllParams) {
  3574. // Test with all parameters.
  3575. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeAllParams.";
  3576. // Create a TextFile dataset
  3577. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3578. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3579. EXPECT_NE(ds, nullptr);
  3580. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  3581. std::shared_ptr<GloVe> glove;
  3582. Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  3583. EXPECT_EQ(s, Status::OK());
  3584. std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  3585. std::shared_ptr<TensorTransform> lookup =
  3586. std::make_shared<text::ToVectors>(glove, unknown_init, true);
  3587. EXPECT_NE(lookup, nullptr);
  3588. // Create Map operation on ds
  3589. ds = ds->Map({lookup}, {"text"});
  3590. EXPECT_NE(ds, nullptr);
  3591. // Create an iterator over the result of the above dataset
  3592. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3593. EXPECT_NE(iter, nullptr);
  3594. // Iterate the dataset and get each row
  3595. std::unordered_map<std::string, mindspore::MSTensor> row;
  3596. ASSERT_OK(iter->GetNextRow(&row));
  3597. uint64_t i = 0;
  3598. std::vector<std::vector<float>> expected = {
  3599. {0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3600. {-1, -1, -1, -1, -1, -1},
  3601. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3602. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3603. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3604. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3605. {-1, -1, -1, -1, -1, -1}};
  3606. while (row.size() != 0) {
  3607. auto ind = row["text"];
  3608. MS_LOG(INFO) << ind.Shape();
  3609. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3610. TensorPtr de_expected_item;
  3611. dsize_t dim = 6;
  3612. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3613. mindspore::MSTensor ms_expected_item =
  3614. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3615. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3616. ASSERT_OK(iter->GetNextRow(&row));
  3617. i++;
  3618. }
  3619. EXPECT_EQ(i, 7);
  3620. // Manually terminate the pipeline
  3621. iter->Stop();
  3622. }
  3623. /// Feature: GloVe
  3624. /// Description: test with pre-vectors set that have the different dimension
  3625. /// Expectation: throw correct error and message
  3626. TEST_F(MindDataTestPipeline, TestGloVeDifferentDimension) {
  3627. // Tokens don't have the same number of glove.
  3628. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeDifferentDimension.";
  3629. // Create a TextFile dataset
  3630. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3631. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3632. EXPECT_NE(ds, nullptr);
  3633. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.dim_different.txt";
  3634. std::shared_ptr<GloVe> glove;
  3635. Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100);
  3636. EXPECT_NE(s, Status::OK());
  3637. }
  3638. /// Feature: GloVe
  3639. /// Description: test with the parameter max_vectors that is <= 0
  3640. /// Expectation: throw correct error and message
  3641. TEST_F(MindDataTestPipeline, TestGloVeMaxVectorsLessThanZero) {
  3642. // Test with max_vectors <= 0.
  3643. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeMaxVectorsLessThanZero.";
  3644. // Create a TextFile dataset
  3645. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3646. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3647. EXPECT_NE(ds, nullptr);
  3648. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  3649. std::shared_ptr<GloVe> glove;
  3650. Status s = GloVe::BuildFromFile(&glove, vectors_dir, -1);
  3651. EXPECT_NE(s, Status::OK());
  3652. }
  3653. /// Feature: GloVe
  3654. /// Description: test with the pre-vectors file that is empty
  3655. /// Expectation: throw correct error and message
  3656. TEST_F(MindDataTestPipeline, TestGloVeWithEmptyFile) {
  3657. // Read empty file.
  3658. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithEmptyFile.";
  3659. // Create a TextFile dataset
  3660. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3661. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3662. EXPECT_NE(ds, nullptr);
  3663. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.empty.txt";
  3664. std::shared_ptr<GloVe> glove;
  3665. Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  3666. EXPECT_NE(s, Status::OK());
  3667. }
  3668. /// Feature: GloVe
  3669. /// Description: test with the pre-vectors file that is not exist
  3670. /// Expectation: throw correct error and message
  3671. TEST_F(MindDataTestPipeline, TestGloVeWithNotExistFile) {
  3672. // Test with not exist file.
  3673. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithNotExistFile.";
  3674. // Create a TextFile dataset
  3675. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3676. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3677. EXPECT_NE(ds, nullptr);
  3678. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.empty.txt";
  3679. std::shared_ptr<GloVe> glove;
  3680. Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  3681. EXPECT_NE(s, Status::OK());
  3682. }
  3683. /// Feature: GloVe
  3684. /// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set
  3685. /// Expectation: throw correct error and message
  3686. TEST_F(MindDataTestPipeline, TestGloVeWithWrongInfoFile) {
  3687. // Wrong info.
  3688. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithWrongInfoFile.";
  3689. // Create a TextFile dataset
  3690. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3691. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3692. EXPECT_NE(ds, nullptr);
  3693. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.with_wrong_info.txt";
  3694. std::shared_ptr<GloVe> glove;
  3695. Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  3696. EXPECT_NE(s, Status::OK());
  3697. }
  3698. /// Feature: GloVe
  3699. /// Description: test with the pre-vectors set that has a wrong format
  3700. /// Expectation: throw correct error and message
  3701. TEST_F(MindDataTestPipeline, TestGloVeWithWrongFormat) {
  3702. // Wrong info.
  3703. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithWrongFormat.";
  3704. // Create a TextFile dataset
  3705. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3706. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3707. EXPECT_NE(ds, nullptr);
  3708. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.tests.vec";
  3709. std::shared_ptr<GloVe> glove;
  3710. Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  3711. EXPECT_NE(s, Status::OK());
  3712. }
  3713. /// Feature: CharNGram
  3714. /// Description: test with default parameter in function BuildFromFile and function Lookup
  3715. /// Expectation: return correct MSTensor which is equal to the excepted
  3716. TEST_F(MindDataTestPipeline, TestCharNGramDefaultParam) {
  3717. // Test with default parameter.
  3718. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramDefaultParam.";
  3719. // Create a TextFile dataset
  3720. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3721. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3722. EXPECT_NE(ds, nullptr);
  3723. std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  3724. std::shared_ptr<CharNGram> char_n_gram;
  3725. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
  3726. EXPECT_EQ(s, Status::OK());
  3727. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram);
  3728. EXPECT_NE(lookup, nullptr);
  3729. // Create Map operation on ds
  3730. ds = ds->Map({lookup}, {"text"});
  3731. EXPECT_NE(ds, nullptr);
  3732. // Create an iterator over the result of the above dataset
  3733. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3734. EXPECT_NE(iter, nullptr);
  3735. // Iterate the dataset and get each row
  3736. std::unordered_map<std::string, mindspore::MSTensor> row;
  3737. ASSERT_OK(iter->GetNextRow(&row));
  3738. uint64_t i = 0;
  3739. std::vector<std::vector<float>> expected = {{0,0,0,0,0},
  3740. {0,0,0,0,0},
  3741. {0.117336,0.362446,-0.983326,0.939264,-0.05648},
  3742. {0.657201,2.11761,-1.59276,0.432072,1.21395},
  3743. {0,0,0,0,0},
  3744. {-2.26956,0.288491,-0.740001,0.661703,0.147355},
  3745. {0,0,0,0,0}};
  3746. while (row.size() != 0) {
  3747. auto ind = row["text"];
  3748. MS_LOG(INFO) << ind.Shape();
  3749. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3750. TensorPtr de_expected_item;
  3751. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
  3752. mindspore::MSTensor ms_expected_item =
  3753. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3754. std::vector<int64_t> ind_shape = ind.Shape();
  3755. std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
  3756. EXPECT_EQ(ind_shape, ms_expected_shape);
  3757. ASSERT_OK(iter->GetNextRow(&row));
  3758. i++;
  3759. }
  3760. EXPECT_EQ(i, 7);
  3761. // Manually terminate the pipeline
  3762. iter->Stop();
  3763. }
  3764. /// Feature: CharNGram.
  3765. /// Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
  3766. /// Expectation: return correct MSTensor which is equal to the excepted
  3767. TEST_F(MindDataTestPipeline, TestCharNGramAllBuildfromfileParams) {
  3768. // Test with two parameters.
  3769. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramAllBuildfromfileParams.";
  3770. // Create a TextFile dataset
  3771. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3772. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3773. EXPECT_NE(ds, nullptr);
  3774. std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  3775. std::shared_ptr<CharNGram> char_n_gram;
  3776. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, 18);
  3777. EXPECT_EQ(s, Status::OK());
  3778. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram);
  3779. EXPECT_NE(lookup, nullptr);
  3780. // Create Map operation on ds
  3781. ds = ds->Map({lookup}, {"text"});
  3782. EXPECT_NE(ds, nullptr);
  3783. // Create an iterator over the result of the above dataset
  3784. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3785. EXPECT_NE(iter, nullptr);
  3786. // Iterate the dataset and get each row
  3787. std::unordered_map<std::string, mindspore::MSTensor> row;
  3788. ASSERT_OK(iter->GetNextRow(&row));
  3789. uint64_t i = 0;
  3790. std::vector<std::vector<float>> expected = {{0,0,0,0,0},
  3791. {0,0,0,0,0},
  3792. {-0.155665,0.664073,-0.538499,1.22657,-0.2162},
  3793. {0.657201,2.11761,-1.59276,0.432072,1.21395},
  3794. {0,0,0,0,0},
  3795. {-2.26956,0.288491,-0.740001,0.661703,0.147355},
  3796. {0,0,0,0,0}};
  3797. while (row.size() != 0) {
  3798. auto ind = row["text"];
  3799. MS_LOG(INFO) << ind.Shape();
  3800. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3801. TensorPtr de_expected_item;
  3802. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
  3803. mindspore::MSTensor ms_expected_item =
  3804. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3805. std::vector<int64_t> ind_shape = ind.Shape();
  3806. std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
  3807. EXPECT_EQ(ind_shape, ms_expected_shape);
  3808. ASSERT_OK(iter->GetNextRow(&row));
  3809. i++;
  3810. }
  3811. EXPECT_EQ(i, 7);
  3812. // Manually terminate the pipeline
  3813. iter->Stop();
  3814. }
  3815. /// Feature: CharNGram
  3816. /// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
  3817. /// Expectation: return correct MSTensor which is equal to the excepted
  3818. TEST_F(MindDataTestPipeline, TestCharNGramUnknownInit) {
  3819. // Test with two parameters.
  3820. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramUnknownInit.";
  3821. // Create a TextFile dataset
  3822. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3823. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3824. EXPECT_NE(ds, nullptr);
  3825. std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  3826. std::shared_ptr<CharNGram> char_n_gram;
  3827. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, 18);
  3828. EXPECT_EQ(s, Status::OK());
  3829. std::vector<float> unknown_init(5, -1);
  3830. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram, unknown_init);
  3831. EXPECT_NE(lookup, nullptr);
  3832. // Create Map operation on ds
  3833. ds = ds->Map({lookup}, {"text"});
  3834. EXPECT_NE(ds, nullptr);
  3835. // Create an iterator over the result of the above dataset
  3836. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3837. EXPECT_NE(iter, nullptr);
  3838. // Iterate the dataset and get each row
  3839. std::unordered_map<std::string, mindspore::MSTensor> row;
  3840. ASSERT_OK(iter->GetNextRow(&row));
  3841. uint64_t i = 0;
  3842. std::vector<std::vector<float>> expected = {{-1,-1,-1,-1,-1},
  3843. {-1,-1,-1,-1,-1},
  3844. {-0.155665,0.664073,-0.538499,1.22657,-0.2162},
  3845. {0.657201,2.11761,-1.59276,0.432072,1.21395},
  3846. {-1,-1,-1,-1,-1},
  3847. {-2.26956,0.288491,-0.740001,0.661703,0.147355},
  3848. {-1,-1,-1,-1,-1}};
  3849. while (row.size() != 0) {
  3850. auto ind = row["text"];
  3851. MS_LOG(INFO) << ind.Shape();
  3852. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3853. TensorPtr de_expected_item;
  3854. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
  3855. mindspore::MSTensor ms_expected_item =
  3856. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3857. std::vector<int64_t> ind_shape = ind.Shape();
  3858. std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
  3859. EXPECT_EQ(ind_shape, ms_expected_shape);
  3860. ASSERT_OK(iter->GetNextRow(&row));
  3861. i++;
  3862. }
  3863. EXPECT_EQ(i, 7);
  3864. // Manually terminate the pipeline
  3865. iter->Stop();
  3866. }
  3867. /// Feature: CharNGram
  3868. /// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
  3869. /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
  3870. /// Expectation: return correct MSTensor which is equal to the excepted
  3871. TEST_F(MindDataTestPipeline, TestCharNGramAllParams) {
  3872. // Test with all parameters.
  3873. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramAllParams.";
  3874. // Create a TextFile dataset
  3875. std::string data_file = datasets_root_path_ + "/testVectors/words_with_big_letter.txt";
  3876. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3877. EXPECT_NE(ds, nullptr);
  3878. std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  3879. std::shared_ptr<CharNGram> char_n_gram;
  3880. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
  3881. EXPECT_EQ(s, Status::OK());
  3882. std::vector<float> unknown_init(5, -1);
  3883. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram, unknown_init, true);
  3884. EXPECT_NE(lookup, nullptr);
  3885. // Create Map operation on ds
  3886. ds = ds->Map({lookup}, {"text"});
  3887. EXPECT_NE(ds, nullptr);
  3888. // Create an iterator over the result of the above dataset
  3889. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3890. EXPECT_NE(iter, nullptr);
  3891. // Iterate the dataset and get each row
  3892. std::unordered_map<std::string, mindspore::MSTensor> row;
  3893. ASSERT_OK(iter->GetNextRow(&row));
  3894. uint64_t i = 0;
  3895. std::vector<std::vector<float>> expected = {{-1,-1,-1,-1,-1},
  3896. {-1,-1,-1,-1,-1},
  3897. {0.117336,0.362446,-0.983326,0.939264,-0.05648},
  3898. {0.657201,2.11761,-1.59276,0.432072,1.21395},
  3899. {-1,-1,-1,-1,-1},
  3900. {-2.26956,0.288491,-0.740001,0.661703,0.147355},
  3901. {-1,-1,-1,-1,-1}};
  3902. while (row.size() != 0) {
  3903. auto ind = row["text"];
  3904. MS_LOG(INFO) << ind.Shape();
  3905. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3906. TensorPtr de_expected_item;
  3907. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
  3908. mindspore::MSTensor ms_expected_item =
  3909. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3910. std::vector<int64_t> ind_shape = ind.Shape();
  3911. std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
  3912. EXPECT_EQ(ind_shape, ms_expected_shape);
  3913. ASSERT_OK(iter->GetNextRow(&row));
  3914. i++;
  3915. }
  3916. EXPECT_EQ(i, 7);
  3917. // Manually terminate the pipeline
  3918. iter->Stop();
  3919. }
  3920. /// Feature: CharNGram
  3921. /// Description: test with pre-vectors set that have the different dimension
  3922. /// Expectation: throw correct error and message
  3923. TEST_F(MindDataTestPipeline, TestCharNGramDifferentDimension) {
  3924. // Tokens don't have the same number of vectors.
  3925. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramDifferentDimension.";
  3926. // Create a TextFile dataset
  3927. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3928. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3929. EXPECT_NE(ds, nullptr);
  3930. std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20_dim_different.txt";
  3931. std::shared_ptr<CharNGram> char_n_gram;
  3932. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
  3933. EXPECT_NE(s, Status::OK());
  3934. }
  3935. /// Feature: CharNGram
  3936. /// Description: test with the parameter max_vectors that is <= 0
  3937. /// Expectation: throw correct error and message
  3938. TEST_F(MindDataTestPipeline, TestCharNGramMaxVectorsLessThanZero) {
  3939. // Test with max_vectors <= 0.
  3940. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramMaxVectorsLessThanZero.";
  3941. // Create a TextFile dataset
  3942. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3943. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3944. EXPECT_NE(ds, nullptr);
  3945. std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  3946. std::shared_ptr<CharNGram> char_n_gram;
  3947. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, -1);
  3948. EXPECT_NE(s, Status::OK());
  3949. }
  3950. /// Feature: CharNGram
  3951. /// Description: test with the pre-vectors file that is empty
  3952. /// Expectation: throw correct error and message
  3953. TEST_F(MindDataTestPipeline, TestCharNGramWithEmptyFile) {
  3954. // Read empty file.
  3955. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramWithEmptyFile.";
  3956. // Create a TextFile dataset
  3957. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3958. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3959. EXPECT_NE(ds, nullptr);
  3960. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_empty.txt";
  3961. std::shared_ptr<CharNGram> char_n_gram;
  3962. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
  3963. EXPECT_NE(s, Status::OK());
  3964. }
  3965. /// Feature: CharNGram
  3966. /// Description: test with the pre-vectors file that is not exist
  3967. /// Expectation: throw correct error and message
  3968. TEST_F(MindDataTestPipeline, TestCharNGramsWithNotExistFile) {
  3969. // Test with not exist file.
  3970. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramsWithNotExistFile.";
  3971. // Create a TextFile dataset
  3972. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3973. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3974. EXPECT_NE(ds, nullptr);
  3975. std::string vectors_dir = datasets_root_path_ + "/testVectors/no_vectors.txt";
  3976. std::shared_ptr<CharNGram> char_n_gram;
  3977. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
  3978. EXPECT_NE(s, Status::OK());
  3979. }