You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

c_api_text_test.cc 200 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
747784779478047814782478347844785478647874788478947904791479247934794479547964797479847994800480148024803480448054806480748084809481048114812481348144815481648174818481948204821482248234824482548264827482848294830483148324833483448354836483748384839484048414842484348444845484648474848484948504851485248534854485548564857485848594860486148624863486448654866486748684869487048714872487348744875487648774878487948804881488248834884488548864887488848894890489148924893489448954896489748984899490049014902490349044905490649074908490949104911491249134914491549164917491849194920492149224923492449254926492749284929493049314932493349344935493649374938493949404941494249434944494549464947494849494950495149524953495449554956495749584959496049614962496349644965496649674968496949704971497249734974497549764977497849794980498149824983498449854986498749884989499049914992499349944995499649974998499950005001500250035004500550065007500850095010501150125013501450155016501750185019502050215022502350245025502650275028
  1. /**
  2. * Copyright 2020-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <memory>
  17. #include <string>
  18. #include <vector>
  19. #include "common/common.h"
  20. #include "include/api/status.h"
  21. #include "minddata/dataset/include/dataset/config.h"
  22. #include "minddata/dataset/include/dataset/datasets.h"
  23. #include "minddata/dataset/include/dataset/text.h"
  24. #include "minddata/dataset/include/dataset/transforms.h"
  25. #include "minddata/dataset/text/char_n_gram.h"
  26. #include "minddata/dataset/text/fast_text.h"
  27. #include "minddata/dataset/text/glove.h"
  28. #include "minddata/dataset/text/vectors.h"
  29. using namespace mindspore::dataset;
  30. using mindspore::Status;
  31. using mindspore::dataset::CharNGram;
  32. using mindspore::dataset::FastText;
  33. using mindspore::dataset::GloVe;
  34. using mindspore::dataset::ShuffleMode;
  35. using mindspore::dataset::Tensor;
  36. using mindspore::dataset::Vectors;
  37. using mindspore::dataset::Vocab;
  38. class MindDataTestPipeline : public UT::DatasetOpTesting {
  39. protected:
  40. };
  41. TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess1) {
  42. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess1.";
  43. // Test BasicTokenizer with default parameters
  44. // Create a TextFile dataset
  45. std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  46. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  47. EXPECT_NE(ds, nullptr);
  48. // Create Take operation on ds
  49. ds = ds->Take(6);
  50. EXPECT_NE(ds, nullptr);
  51. // Create BasicTokenizer operation on ds
  52. std::shared_ptr<TensorTransform> basic_tokenizer = std::make_shared<text::BasicTokenizer>();
  53. EXPECT_NE(basic_tokenizer, nullptr);
  54. // Create Map operation on ds
  55. ds = ds->Map({basic_tokenizer}, {"text"});
  56. EXPECT_NE(ds, nullptr);
  57. // Create an iterator over the result of the above dataset
  58. // This will trigger the creation of the Execution Tree and launch it.
  59. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  60. EXPECT_NE(iter, nullptr);
  61. // Iterate the dataset and get each row
  62. std::unordered_map<std::string, mindspore::MSTensor> row;
  63. ASSERT_OK(iter->GetNextRow(&row));
  64. std::vector<std::vector<std::string>> expected = {
  65. {"Welcome", "to", "Beijing", "北", "京", "欢", "迎", "您"},
  66. {"長", "風", "破", "浪", "會", "有", "時", ",", "直", "掛", "雲", "帆", "濟", "滄", "海"},
  67. {"😀", "嘿", "嘿", "😃", "哈", "哈", "😄", "大", "笑", "😁", "嘻", "嘻"},
  68. {"明", "朝", "(", "1368", "—", "1644", "年", ")", "和", "清", "朝", "(", "1644", "—", "1911", "年", ")",
  69. ",", "是", "中", "国", "封", "建", "王", "朝", "史", "上", "最", "后", "两", "个", "朝", "代"},
  70. {"明", "代", "(", "1368", "-", "1644", ")", "と", "清", "代", "(", "1644",
  71. "-", "1911", ")", "は", "、", "中", "国", "の", "封", "建", "王", "朝",
  72. "の", "歴", "史", "における", "最", "後", "の2つの", "王", "朝", "でした"},
  73. {"명나라", "(", "1368", "-", "1644", ")", "와", "청나라", "(", "1644", "-",
  74. "1911", ")", "는", "중국", "봉건", "왕조의", "역사에서", "마지막", "두", "왕조였다"}};
  75. uint64_t i = 0;
  76. while (row.size() != 0) {
  77. auto ind = row["text"];
  78. std::shared_ptr<Tensor> de_expected_tensor;
  79. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  80. mindspore::MSTensor expected_tensor =
  81. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  82. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  83. ASSERT_OK(iter->GetNextRow(&row));
  84. i++;
  85. }
  86. EXPECT_EQ(i, 6);
  87. // Manually terminate the pipeline
  88. iter->Stop();
  89. }
  90. TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess2) {
  91. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess2.";
  92. // Test BasicTokenizer with lower_case true
  93. // Create a TextFile dataset
  94. std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  95. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  96. EXPECT_NE(ds, nullptr);
  97. // Create Skip operation on ds
  98. ds = ds->Skip(6);
  99. EXPECT_NE(ds, nullptr);
  100. // Create BasicTokenizer operation on ds
  101. std::shared_ptr<TensorTransform> basic_tokenizer = std::make_shared<text::BasicTokenizer>(true);
  102. EXPECT_NE(basic_tokenizer, nullptr);
  103. // Create Map operation on ds
  104. ds = ds->Map({basic_tokenizer}, {"text"});
  105. EXPECT_NE(ds, nullptr);
  106. // Create an iterator over the result of the above dataset
  107. // This will trigger the creation of the Execution Tree and launch it.
  108. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  109. EXPECT_NE(iter, nullptr);
  110. // Iterate the dataset and get each row
  111. std::unordered_map<std::string, mindspore::MSTensor> row;
  112. ASSERT_OK(iter->GetNextRow(&row));
  113. std::vector<std::string> expected = {"this", "is", "a", "funky", "string"};
  114. std::shared_ptr<Tensor> de_expected_tensor;
  115. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  116. mindspore::MSTensor expected_tensor =
  117. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  118. uint64_t i = 0;
  119. while (row.size() != 0) {
  120. auto ind = row["text"];
  121. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  122. ASSERT_OK(iter->GetNextRow(&row));
  123. i++;
  124. }
  125. EXPECT_EQ(i, 1);
  126. // Manually terminate the pipeline
  127. iter->Stop();
  128. }
  129. TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess3) {
  130. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess3.";
  131. // Test BasicTokenizer with with_offsets true and lower_case true
  132. // Create a TextFile dataset
  133. std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  134. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  135. EXPECT_NE(ds, nullptr);
  136. // Create Skip operation on ds
  137. ds = ds->Skip(6);
  138. EXPECT_NE(ds, nullptr);
  139. // Create BasicTokenizer operation on ds
  140. std::shared_ptr<TensorTransform> basic_tokenizer =
  141. std::make_shared<text::BasicTokenizer>(true, false, NormalizeForm::kNone, true, true);
  142. EXPECT_NE(basic_tokenizer, nullptr);
  143. // Create Map operation on ds
  144. ds = ds->Map({basic_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  145. EXPECT_NE(ds, nullptr);
  146. // Create an iterator over the result of the above dataset
  147. // This will trigger the creation of the Execution Tree and launch it.
  148. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  149. EXPECT_NE(iter, nullptr);
  150. // Iterate the dataset and get each row
  151. std::unordered_map<std::string, mindspore::MSTensor> row;
  152. ASSERT_OK(iter->GetNextRow(&row));
  153. std::vector<std::string> expected_tokens = {"this", "is", "a", "funky", "string"};
  154. std::vector<uint32_t> expected_offsets_start = {0, 5, 8, 10, 16};
  155. std::vector<uint32_t> expected_offsets_limit = {4, 7, 9, 15, 22};
  156. std::shared_ptr<Tensor> de_expected_tokens;
  157. ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens));
  158. mindspore::MSTensor ms_expected_tokens =
  159. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  160. std::shared_ptr<Tensor> de_expected_offsets_start;
  161. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start));
  162. mindspore::MSTensor ms_expected_offsets_start =
  163. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  164. std::shared_ptr<Tensor> de_expected_offsets_limit;
  165. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit));
  166. mindspore::MSTensor ms_expected_offsets_limit =
  167. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  168. uint64_t i = 0;
  169. while (row.size() != 0) {
  170. auto ind = row["token"];
  171. EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens);
  172. auto start = row["offsets_start"];
  173. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  174. auto limit = row["offsets_limit"];
  175. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  176. ASSERT_OK(iter->GetNextRow(&row));
  177. i++;
  178. }
  179. EXPECT_EQ(i, 1);
  180. // Manually terminate the pipeline
  181. iter->Stop();
  182. }
  // Word list used to build the Vocab for the BertTokenizer tests in this file.
  // Element order is kept exactly as-is: the entries are only regrouped by kind for readability.
  std::vector<std::string> list = {
    // Chinese characters
    "床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头", "望", "低", "思", "故", "乡",
    "繁", "體", "字", "嘿", "哈", "大", "笑", "嘻",
    // Lower-case English words and "##"-prefixed pieces
    "i", "am", "mak", "make", "small", "mistake", "##s", "during", "work", "##ing", "hour",
    // Emoji
    "😀", "😃", "😄", "😁",
    // Operators and numbers
    "+", "/", "-", "=", "12", "28", "40", "16",
    // A single space, an upper-case letter and bracketed tokens
    " ", "I", "[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]", "[unused1]", "[unused10]"};
  189. TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess1) {
  190. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess1.";
  191. // Test BertTokenizer with default parameters
  192. // Create a TextFile dataset
  193. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  194. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  195. EXPECT_NE(ds, nullptr);
  196. // Create Take operation on ds
  197. ds = ds->Take(4);
  198. EXPECT_NE(ds, nullptr);
  199. // Create a vocab from vector
  200. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  201. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  202. EXPECT_EQ(s, Status::OK());
  203. // Create BertTokenizer operation on ds
  204. std::shared_ptr<TensorTransform> bert_tokenizer = std::make_shared<text::BertTokenizer>(vocab);
  205. EXPECT_NE(bert_tokenizer, nullptr);
  206. // Create Map operation on ds
  207. ds = ds->Map({bert_tokenizer}, {"text"});
  208. EXPECT_NE(ds, nullptr);
  209. // Create an iterator over the result of the above dataset
  210. // This will trigger the creation of the Execution Tree and launch it.
  211. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  212. EXPECT_NE(iter, nullptr);
  213. // Iterate the dataset and get each row
  214. std::unordered_map<std::string, mindspore::MSTensor> row;
  215. ASSERT_OK(iter->GetNextRow(&row));
  216. std::vector<std::vector<std::string>> expected = {{"床", "前", "明", "月", "光"},
  217. {"疑", "是", "地", "上", "霜"},
  218. {"举", "头", "望", "明", "月"},
  219. {"低", "头", "思", "故", "乡"}};
  220. uint64_t i = 0;
  221. while (row.size() != 0) {
  222. auto ind = row["text"];
  223. std::shared_ptr<Tensor> de_expected_tensor;
  224. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  225. mindspore::MSTensor expected_tensor =
  226. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  227. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  228. ASSERT_OK(iter->GetNextRow(&row));
  229. i++;
  230. }
  231. EXPECT_EQ(i, 4);
  232. // Manually terminate the pipeline
  233. iter->Stop();
  234. }
  235. TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess2) {
  236. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess2.";
  237. // Test BertTokenizer with lower_case true
  238. // Create a TextFile dataset
  239. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  240. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  241. EXPECT_NE(ds, nullptr);
  242. // Create Skip operation on ds
  243. ds = ds->Skip(4);
  244. EXPECT_NE(ds, nullptr);
  245. // Create Take operation on ds
  246. ds = ds->Take(1);
  247. EXPECT_NE(ds, nullptr);
  248. // Create a vocab from vector
  249. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  250. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  251. EXPECT_EQ(s, Status::OK());
  252. // Create BertTokenizer operation on ds
  253. std::shared_ptr<TensorTransform> bert_tokenizer =
  254. std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", true);
  255. EXPECT_NE(bert_tokenizer, nullptr);
  256. // Create Map operation on ds
  257. ds = ds->Map({bert_tokenizer}, {"text"});
  258. EXPECT_NE(ds, nullptr);
  259. // Create an iterator over the result of the above dataset
  260. // This will trigger the creation of the Execution Tree and launch it.
  261. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  262. EXPECT_NE(iter, nullptr);
  263. // Iterate the dataset and get each row
  264. std::unordered_map<std::string, mindspore::MSTensor> row;
  265. ASSERT_OK(iter->GetNextRow(&row));
  266. std::vector<std::string> expected = {"i", "am", "mak", "##ing", "small", "mistake",
  267. "##s", "during", "work", "##ing", "hour", "##s"};
  268. std::shared_ptr<Tensor> de_expected_tensor;
  269. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  270. mindspore::MSTensor expected_tensor =
  271. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  272. uint64_t i = 0;
  273. while (row.size() != 0) {
  274. auto ind = row["text"];
  275. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  276. ASSERT_OK(iter->GetNextRow(&row));
  277. i++;
  278. }
  279. EXPECT_EQ(i, 1);
  280. // Manually terminate the pipeline
  281. iter->Stop();
  282. }
  283. TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess3) {
  284. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess3.";
  285. // Test BertTokenizer with normalization_form NFKC
  286. // Create a TextFile dataset
  287. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  288. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  289. EXPECT_NE(ds, nullptr);
  290. // Create Skip operation on ds
  291. ds = ds->Skip(5);
  292. EXPECT_NE(ds, nullptr);
  293. // Create Take operation on ds
  294. ds = ds->Take(2);
  295. EXPECT_NE(ds, nullptr);
  296. // Create a vocab from vector
  297. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  298. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  299. EXPECT_EQ(s, Status::OK());
  300. // Create BertTokenizer operation on ds
  301. std::shared_ptr<TensorTransform> bert_tokenizer =
  302. std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", false, false, NormalizeForm::kNfc);
  303. EXPECT_NE(bert_tokenizer, nullptr);
  304. // Create Map operation on ds
  305. ds = ds->Map({bert_tokenizer}, {"text"});
  306. EXPECT_NE(ds, nullptr);
  307. // Create an iterator over the result of the above dataset
  308. // This will trigger the creation of the Execution Tree and launch it.
  309. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  310. EXPECT_NE(iter, nullptr);
  311. // Iterate the dataset and get each row
  312. std::unordered_map<std::string, mindspore::MSTensor> row;
  313. ASSERT_OK(iter->GetNextRow(&row));
  314. std::vector<std::vector<std::string>> expected = {
  315. {"😀", "嘿", "嘿", "😃", "哈", "哈", "😄", "大", "笑", "😁", "嘻", "嘻"}, {"繁", "體", "字"}};
  316. uint64_t i = 0;
  317. while (row.size() != 0) {
  318. auto ind = row["text"];
  319. std::shared_ptr<Tensor> de_expected_tensor;
  320. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  321. mindspore::MSTensor expected_tensor =
  322. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  323. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  324. ASSERT_OK(iter->GetNextRow(&row));
  325. i++;
  326. }
  327. EXPECT_EQ(i, 2);
  328. // Manually terminate the pipeline
  329. iter->Stop();
  330. }
  331. TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess4) {
  332. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess4.";
  333. // Test BertTokenizer with keep_whitespace true
  334. // Create a TextFile dataset
  335. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  336. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  337. EXPECT_NE(ds, nullptr);
  338. // Create Skip operation on ds
  339. ds = ds->Skip(7);
  340. EXPECT_NE(ds, nullptr);
  341. // Create Take operation on ds
  342. ds = ds->Take(1);
  343. EXPECT_NE(ds, nullptr);
  344. // Create a vocab from vector
  345. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  346. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  347. EXPECT_EQ(s, Status::OK());
  348. // Create BertTokenizer operation on ds
  349. std::shared_ptr<TensorTransform> bert_tokenizer =
  350. std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", false, true);
  351. EXPECT_NE(bert_tokenizer, nullptr);
  352. // Create Map operation on ds
  353. ds = ds->Map({bert_tokenizer}, {"text"});
  354. EXPECT_NE(ds, nullptr);
  355. // Create an iterator over the result of the above dataset
  356. // This will trigger the creation of the Execution Tree and launch it.
  357. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  358. EXPECT_NE(iter, nullptr);
  359. // Iterate the dataset and get each row
  360. std::unordered_map<std::string, mindspore::MSTensor> row;
  361. ASSERT_OK(iter->GetNextRow(&row));
  362. std::vector<std::string> expected = {"[UNK]", " ", "[CLS]"};
  363. std::shared_ptr<Tensor> de_expected_tensor;
  364. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  365. mindspore::MSTensor expected_tensor =
  366. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  367. uint64_t i = 0;
  368. while (row.size() != 0) {
  369. auto ind = row["text"];
  370. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  371. ASSERT_OK(iter->GetNextRow(&row));
  372. i++;
  373. }
  374. EXPECT_EQ(i, 1);
  375. // Manually terminate the pipeline
  376. iter->Stop();
  377. }
  378. TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess5) {
  379. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess5.";
  380. // Test BertTokenizer with unknown_token empty and keep_whitespace true
  381. // Create a TextFile dataset
  382. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  383. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  384. EXPECT_NE(ds, nullptr);
  385. // Create Skip operation on ds
  386. ds = ds->Skip(7);
  387. EXPECT_NE(ds, nullptr);
  388. // Create Take operation on ds
  389. ds = ds->Take(1);
  390. EXPECT_NE(ds, nullptr);
  391. // Create a vocab from vector
  392. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  393. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  394. EXPECT_EQ(s, Status::OK());
  395. // Create BertTokenizer operation on ds
  396. std::shared_ptr<TensorTransform> bert_tokenizer =
  397. std::make_shared<text::BertTokenizer>(vocab, "##", 100, "", false, true);
  398. EXPECT_NE(bert_tokenizer, nullptr);
  399. // Create Map operation on ds
  400. ds = ds->Map({bert_tokenizer}, {"text"});
  401. EXPECT_NE(ds, nullptr);
  402. // Create an iterator over the result of the above dataset
  403. // This will trigger the creation of the Execution Tree and launch it.
  404. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  405. EXPECT_NE(iter, nullptr);
  406. // Iterate the dataset and get each row
  407. std::unordered_map<std::string, mindspore::MSTensor> row;
  408. ASSERT_OK(iter->GetNextRow(&row));
  409. std::vector<std::string> expected = {"unused", " ", "[CLS]"};
  410. std::shared_ptr<Tensor> de_expected_tensor;
  411. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  412. mindspore::MSTensor expected_tensor =
  413. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  414. uint64_t i = 0;
  415. while (row.size() != 0) {
  416. auto ind = row["text"];
  417. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  418. ASSERT_OK(iter->GetNextRow(&row));
  419. i++;
  420. }
  421. EXPECT_EQ(i, 1);
  422. // Manually terminate the pipeline
  423. iter->Stop();
  424. }
  425. TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess6) {
  426. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess6.";
  427. // Test BertTokenizer with preserve_unused_token false, unknown_token empty and keep_whitespace true
  428. // Create a TextFile dataset
  429. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  430. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  431. EXPECT_NE(ds, nullptr);
  432. // Create Skip operation on ds
  433. ds = ds->Skip(7);
  434. EXPECT_NE(ds, nullptr);
  435. // Create Take operation on ds
  436. ds = ds->Take(1);
  437. EXPECT_NE(ds, nullptr);
  438. // Create a vocab from vector
  439. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  440. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  441. EXPECT_EQ(s, Status::OK());
  442. // Create BertTokenizer operation on ds
  443. std::shared_ptr<TensorTransform> bert_tokenizer =
  444. std::make_shared<text::BertTokenizer>(vocab, "##", 100, "", false, true, NormalizeForm::kNone, false);
  445. EXPECT_NE(bert_tokenizer, nullptr);
  446. // Create Map operation on ds
  447. ds = ds->Map({bert_tokenizer}, {"text"});
  448. EXPECT_NE(ds, nullptr);
  449. // Create an iterator over the result of the above dataset
  450. // This will trigger the creation of the Execution Tree and launch it.
  451. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  452. EXPECT_NE(iter, nullptr);
  453. // Iterate the dataset and get each row
  454. std::unordered_map<std::string, mindspore::MSTensor> row;
  455. ASSERT_OK(iter->GetNextRow(&row));
  456. std::vector<std::string> expected = {"unused", " ", "[", "CLS", "]"};
  457. std::shared_ptr<Tensor> de_expected_tensor;
  458. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  459. mindspore::MSTensor expected_tensor =
  460. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  461. uint64_t i = 0;
  462. while (row.size() != 0) {
  463. auto ind = row["text"];
  464. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  465. ASSERT_OK(iter->GetNextRow(&row));
  466. i++;
  467. }
  468. EXPECT_EQ(i, 1);
  469. // Manually terminate the pipeline
  470. iter->Stop();
  471. }
  472. TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess7) {
  473. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess7.";
  474. // Test BertTokenizer with with_offsets true and lower_case true
  475. // Create a TextFile dataset
  476. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  477. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  478. EXPECT_NE(ds, nullptr);
  479. // Create Skip operation on ds
  480. ds = ds->Skip(4);
  481. EXPECT_NE(ds, nullptr);
  482. // Create Take operation on ds
  483. ds = ds->Take(1);
  484. EXPECT_NE(ds, nullptr);
  485. // Create a vocab from vector
  486. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  487. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  488. EXPECT_EQ(s, Status::OK());
  489. // Create BertTokenizer operation on ds
  490. std::shared_ptr<TensorTransform> bert_tokenizer =
  491. std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", true, false, NormalizeForm::kNone, true, true);
  492. EXPECT_NE(bert_tokenizer, nullptr);
  493. // Create Map operation on ds
  494. ds = ds->Map({bert_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  495. EXPECT_NE(ds, nullptr);
  496. // Create an iterator over the result of the above dataset
  497. // This will trigger the creation of the Execution Tree and launch it.
  498. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  499. EXPECT_NE(iter, nullptr);
  500. // Iterate the dataset and get each row
  501. std::unordered_map<std::string, mindspore::MSTensor> row;
  502. ASSERT_OK(iter->GetNextRow(&row));
  503. std::vector<std::string> expected_tokens = {"i", "am", "mak", "##ing", "small", "mistake",
  504. "##s", "during", "work", "##ing", "hour", "##s"};
  505. std::vector<uint32_t> expected_offsets_start = {0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46};
  506. std::vector<uint32_t> expected_offsets_limit = {1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47};
  507. std::shared_ptr<Tensor> de_expected_tokens;
  508. ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens));
  509. mindspore::MSTensor ms_expected_tokens =
  510. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  511. std::shared_ptr<Tensor> de_expected_offsets_start;
  512. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start));
  513. mindspore::MSTensor ms_expected_offsets_start =
  514. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  515. std::shared_ptr<Tensor> de_expected_offsets_limit;
  516. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit));
  517. mindspore::MSTensor ms_expected_offsets_limit =
  518. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  519. uint64_t i = 0;
  520. while (row.size() != 0) {
  521. auto ind = row["token"];
  522. EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens);
  523. auto start = row["offsets_start"];
  524. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  525. auto limit = row["offsets_limit"];
  526. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  527. ASSERT_OK(iter->GetNextRow(&row));
  528. i++;
  529. }
  530. EXPECT_EQ(i, 1);
  531. // Manually terminate the pipeline
  532. iter->Stop();
  533. }
  534. TEST_F(MindDataTestPipeline, TestBertTokenizerFail1) {
  535. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail1.";
  536. // Test BertTokenizer with nullptr vocab
  537. // Create a TextFile dataset
  538. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  539. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  540. EXPECT_NE(ds, nullptr);
  541. // Create BertTokenizer operation on ds
  542. std::shared_ptr<TensorTransform> bert_tokenizer = std::make_shared<text::BertTokenizer>(nullptr);
  543. EXPECT_NE(bert_tokenizer, nullptr);
  544. // Create a Map operation on ds
  545. ds = ds->Map({bert_tokenizer});
  546. EXPECT_NE(ds, nullptr);
  547. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  548. // Expect failure: invalid BertTokenizer input with nullptr vocab
  549. EXPECT_EQ(iter, nullptr);
  550. }
  551. TEST_F(MindDataTestPipeline, TestBertTokenizerFail2) {
  552. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail2.";
  553. // Test BertTokenizer with negative max_bytes_per_token
  554. // Create a TextFile dataset
  555. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  556. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  557. EXPECT_NE(ds, nullptr);
  558. // Create a vocab from vector
  559. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  560. Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  561. EXPECT_EQ(s, Status::OK());
  562. // Create BertTokenizer operation on ds
  563. std::shared_ptr<TensorTransform> bert_tokenizer = std::make_shared<text::BertTokenizer>(vocab, "##", -1);
  564. EXPECT_NE(bert_tokenizer, nullptr);
  565. // Create a Map operation on ds
  566. ds = ds->Map({bert_tokenizer});
  567. EXPECT_NE(ds, nullptr);
  568. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  569. // Expect failure: invalid BertTokenizer input with nullptr vocab
  570. EXPECT_EQ(iter, nullptr);
  571. }
  572. TEST_F(MindDataTestPipeline, TestCaseFoldSuccess) {
  573. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaseFoldSuccess.";
  574. // Create a TextFile dataset
  575. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  576. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  577. EXPECT_NE(ds, nullptr);
  578. // Create casefold operation on ds
  579. std::shared_ptr<TensorTransform> casefold = std::make_shared<text::CaseFold>();
  580. EXPECT_NE(casefold, nullptr);
  581. // Create Map operation on ds
  582. ds = ds->Map({casefold}, {"text"});
  583. EXPECT_NE(ds, nullptr);
  584. // Create an iterator over the result of the above dataset
  585. // This will trigger the creation of the Execution Tree and launch it.
  586. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  587. EXPECT_NE(iter, nullptr);
  588. // Iterate the dataset and get each row
  589. std::unordered_map<std::string, mindspore::MSTensor> row;
  590. ASSERT_OK(iter->GetNextRow(&row));
  591. std::vector<std::string> expected = {"welcome to beijing!", "北京欢迎您!", "我喜欢english!", " "};
  592. uint64_t i = 0;
  593. while (row.size() != 0) {
  594. auto ind = row["text"];
  595. std::shared_ptr<Tensor> de_expected_tensor;
  596. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  597. mindspore::MSTensor ms_expected_tensor =
  598. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  599. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  600. ASSERT_OK(iter->GetNextRow(&row));
  601. i++;
  602. }
  603. EXPECT_EQ(i, 4);
  604. // Manually terminate the pipeline
  605. iter->Stop();
  606. }
  607. /// Feature: FilterWikipediaXML
  608. /// Description: test FilterWikipediaXML in pipeline mode
  609. /// Expectation: the data is processed successfully
  610. TEST_F(MindDataTestPipeline, TestFilterWikipediaXMLSuccess) {
  611. // Testing the parameter of FilterWikipediaXML interface .
  612. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFilterWikipediaXMLSuccess.";
  613. // Create a TextFile dataset
  614. std::string data_file = datasets_root_path_ + "/testTokenizerData/2.txt";
  615. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  616. EXPECT_NE(ds, nullptr);
  617. // Create filter_wikipedia_xml operation on ds
  618. std::shared_ptr<TensorTransform> filter_wikipedia_xml = std::make_shared<text::FilterWikipediaXML>();
  619. EXPECT_NE(filter_wikipedia_xml, nullptr);
  620. // Create Map operation on ds
  621. ds = ds->Map({filter_wikipedia_xml}, {"text"});
  622. EXPECT_NE(ds, nullptr);
  623. // Create an iterator over the result of the above dataset
  624. // This will trigger the creation of the Execution Tree and launch it.
  625. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  626. EXPECT_NE(iter, nullptr);
  627. // Iterate the dataset and get each row
  628. std::unordered_map<std::string, mindspore::MSTensor> row;
  629. ASSERT_OK(iter->GetNextRow(&row));
  630. std::vector<std::string> expected = {"welcome to beijing", "", ""};
  631. uint64_t i = 0;
  632. while (row.size() != 0) {
  633. auto ind = row["text"];
  634. std::shared_ptr<Tensor> de_expected_tensor;
  635. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  636. mindspore::MSTensor ms_expected_tensor =
  637. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  638. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  639. ASSERT_OK(iter->GetNextRow(&row));
  640. i++;
  641. }
  642. EXPECT_EQ(i, 3);
  643. // Manually terminate the pipeline
  644. iter->Stop();
  645. }
  646. TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess) {
  647. // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is false.
  648. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess.";
  649. // Create a TextFile dataset
  650. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  651. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  652. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  653. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  654. EXPECT_NE(ds, nullptr);
  655. // Create jieba_tokenizer operation on ds
  656. std::shared_ptr<TensorTransform> jieba_tokenizer =
  657. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  658. EXPECT_NE(jieba_tokenizer, nullptr);
  659. // Create Map operation on ds
  660. ds = ds->Map({jieba_tokenizer}, {"text"});
  661. EXPECT_NE(ds, nullptr);
  662. // Create an iterator over the result of the above dataset
  663. // This will trigger the creation of the Execution Tree and launch it.
  664. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  665. EXPECT_NE(iter, nullptr);
  666. // Iterate the dataset and get each row
  667. std::unordered_map<std::string, mindspore::MSTensor> row;
  668. ASSERT_OK(iter->GetNextRow(&row));
  669. std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
  670. std::shared_ptr<Tensor> de_expected_tensor;
  671. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  672. mindspore::MSTensor expected_tensor =
  673. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  674. uint64_t i = 0;
  675. while (row.size() != 0) {
  676. auto ind = row["text"];
  677. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  678. ASSERT_OK(iter->GetNextRow(&row));
  679. i++;
  680. }
  681. EXPECT_EQ(i, 1);
  682. // Manually terminate the pipeline
  683. iter->Stop();
  684. }
  685. TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess1) {
  686. // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kHmm and the with_offsets is false.
  687. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess1.";
  688. // Create a TextFile dataset
  689. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  690. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  691. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  692. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  693. EXPECT_NE(ds, nullptr);
  694. // Create jieba_tokenizer operation on ds
  695. std::shared_ptr<TensorTransform> jieba_tokenizer =
  696. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kHmm);
  697. EXPECT_NE(jieba_tokenizer, nullptr);
  698. // Create Map operation on ds
  699. ds = ds->Map({jieba_tokenizer}, {"text"});
  700. EXPECT_NE(ds, nullptr);
  701. // Create an iterator over the result of the above dataset
  702. // This will trigger the creation of the Execution Tree and launch it.
  703. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  704. EXPECT_NE(iter, nullptr);
  705. // Iterate the dataset and get each row
  706. std::unordered_map<std::string, mindspore::MSTensor> row;
  707. ASSERT_OK(iter->GetNextRow(&row));
  708. std::vector<std::string> expected = {"今天", "天气", "太", "好", "了", "我们", "一起", "去", "外面", "玩", "吧"};
  709. std::shared_ptr<Tensor> de_expected_tensor;
  710. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  711. mindspore::MSTensor expected_tensor =
  712. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  713. uint64_t i = 0;
  714. while (row.size() != 0) {
  715. auto ind = row["text"];
  716. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  717. ASSERT_OK(iter->GetNextRow(&row));
  718. i++;
  719. }
  720. EXPECT_EQ(i, 1);
  721. // Manually terminate the pipeline
  722. iter->Stop();
  723. }
  724. TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess2) {
  725. // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is true.
  726. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess2.";
  727. // Create a TextFile dataset
  728. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  729. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  730. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  731. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  732. EXPECT_NE(ds, nullptr);
  733. // Create jieba_tokenizer operation on ds
  734. std::shared_ptr<TensorTransform> jieba_tokenizer =
  735. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp, true);
  736. EXPECT_NE(jieba_tokenizer, nullptr);
  737. // Create Map operation on ds
  738. ds = ds->Map({jieba_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
  739. {"token", "offsets_start", "offsets_limit"});
  740. EXPECT_NE(ds, nullptr);
  741. // Create an iterator over the result of the above dataset
  742. // This will trigger the creation of the Execution Tree and launch it.
  743. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  744. EXPECT_NE(iter, nullptr);
  745. // Iterate the dataset and get each row
  746. std::unordered_map<std::string, mindspore::MSTensor> row;
  747. ASSERT_OK(iter->GetNextRow(&row));
  748. std::vector<std::string> expected_tokens = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
  749. std::vector<uint32_t> expected_offsets_start = {0, 12, 21, 27, 33, 36, 42};
  750. std::vector<uint32_t> expected_offsets_limit = {12, 21, 27, 33, 36, 42, 48};
  751. std::shared_ptr<Tensor> de_expected_tokens;
  752. ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens));
  753. mindspore::MSTensor ms_expected_tokens =
  754. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  755. std::shared_ptr<Tensor> de_expected_offsets_start;
  756. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start));
  757. mindspore::MSTensor ms_expected_offsets_start =
  758. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  759. std::shared_ptr<Tensor> de_expected_offsets_limit;
  760. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit));
  761. mindspore::MSTensor ms_expected_offsets_limit =
  762. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  763. uint64_t i = 0;
  764. while (row.size() != 0) {
  765. auto ind = row["token"];
  766. EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens);
  767. auto start = row["offsets_start"];
  768. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  769. auto limit = row["offsets_limit"];
  770. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  771. ASSERT_OK(iter->GetNextRow(&row));
  772. i++;
  773. }
  774. EXPECT_EQ(i, 1);
  775. // Manually terminate the pipeline
  776. iter->Stop();
  777. }
  778. TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail1) {
  779. // Testing the incorrect parameter of JiebaTokenizer interface.
  780. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail1.";
  781. // Create a TextFile dataset
  782. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  783. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  784. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  785. EXPECT_NE(ds, nullptr);
  786. // Create jieba_tokenizer operation on ds
  787. // Testing the parameter hmm_path is empty
  788. std::shared_ptr<TensorTransform> jieba_tokenizer =
  789. std::make_shared<text::JiebaTokenizer>("", mp_path, JiebaMode::kMp);
  790. EXPECT_NE(jieba_tokenizer, nullptr);
  791. // Create a Map operation on ds
  792. ds = ds->Map({jieba_tokenizer});
  793. EXPECT_NE(ds, nullptr);
  794. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  795. // Expect failure: invalid JiebaTokenizer input (parameter hmm_path is empty)
  796. EXPECT_EQ(iter, nullptr);
  797. }
  798. TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail2) {
  799. // Testing the incorrect parameter of JiebaTokenizer interface.
  800. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail2.";
  801. // Create a TextFile dataset
  802. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  803. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  804. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  805. EXPECT_NE(ds, nullptr);
  806. // Create jieba_tokenizer operation on ds
  807. // Testing the parameter mp_path is empty
  808. std::shared_ptr<TensorTransform> jieba_tokenizer =
  809. std::make_shared<text::JiebaTokenizer>(hmm_path, "", JiebaMode::kMp);
  810. EXPECT_NE(jieba_tokenizer, nullptr);
  811. // Create a Map operation on ds
  812. ds = ds->Map({jieba_tokenizer});
  813. EXPECT_NE(ds, nullptr);
  814. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  815. // Expect failure: invalid JiebaTokenizer input (parameter mp_path is empty)
  816. EXPECT_EQ(iter, nullptr);
  817. }
  818. TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail3) {
  819. // Testing the incorrect parameter of JiebaTokenizer interface.
  820. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail3.";
  821. // Create a TextFile dataset
  822. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  823. std::string hmm_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
  824. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  825. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  826. EXPECT_NE(ds, nullptr);
  827. // Create jieba_tokenizer operation on ds
  828. // Testing the parameter hmm_path is invalid path
  829. std::shared_ptr<TensorTransform> jieba_tokenizer =
  830. std::make_shared<text::JiebaTokenizer>(hmm_path_invalid, mp_path, JiebaMode::kMp);
  831. EXPECT_NE(jieba_tokenizer, nullptr);
  832. // Create a Map operation on ds
  833. ds = ds->Map({jieba_tokenizer});
  834. EXPECT_NE(ds, nullptr);
  835. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  836. // Expect failure: invalid JiebaTokenizer input (parameter hmm_path is invalid path)
  837. EXPECT_EQ(iter, nullptr);
  838. }
  839. TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail4) {
  840. // Testing the incorrect parameter of JiebaTokenizer interface.
  841. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail4.";
  842. // Create a TextFile dataset
  843. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  844. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  845. std::string mp_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
  846. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  847. EXPECT_NE(ds, nullptr);
  848. // Create jieba_tokenizer operation on ds
  849. // Testing the parameter mp_path is invalid path
  850. std::shared_ptr<TensorTransform> jieba_tokenizer =
  851. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path_invalid, JiebaMode::kMp);
  852. EXPECT_NE(jieba_tokenizer, nullptr);
  853. // Create a Map operation on ds
  854. ds = ds->Map({jieba_tokenizer});
  855. EXPECT_NE(ds, nullptr);
  856. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  857. // Expect failure: invalid JiebaTokenizer input (parameter mp_path is invalid path)
  858. EXPECT_EQ(iter, nullptr);
  859. }
  860. TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) {
  861. // Testing the parameter AddWord of JiebaTokenizer when the freq is not provided (default 0).
  862. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord.";
  863. // Create a TextFile dataset
  864. std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
  865. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  866. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  867. std::shared_ptr<Dataset> ds = TextFile({data_file});
  868. EXPECT_NE(ds, nullptr);
  869. // Create jieba_tokenizer operation on ds
  870. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
  871. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  872. EXPECT_NE(jieba_tokenizer, nullptr);
  873. // Add word with freq not provided (default 0)
  874. ASSERT_OK(jieba_tokenizer->AddWord("男默女泪"));
  875. // Create Map operation on ds
  876. ds = ds->Map({jieba_tokenizer}, {"text"});
  877. EXPECT_NE(ds, nullptr);
  878. // Create an iterator over the result of the above dataset
  879. // This will trigger the creation of the Execution Tree and launch it.
  880. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  881. EXPECT_NE(iter, nullptr);
  882. // Iterate the dataset and get each row
  883. std::unordered_map<std::string, mindspore::MSTensor> row;
  884. ASSERT_OK(iter->GetNextRow(&row));
  885. std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
  886. std::shared_ptr<Tensor> de_expected_tensor;
  887. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  888. mindspore::MSTensor expected_tensor =
  889. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  890. uint64_t i = 0;
  891. while (row.size() != 0) {
  892. auto ind = row["text"];
  893. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  894. ASSERT_OK(iter->GetNextRow(&row));
  895. i++;
  896. }
  897. EXPECT_EQ(i, 1);
  898. // Manually terminate the pipeline
  899. iter->Stop();
  900. }
  901. TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) {
  902. // Testing the parameter AddWord of JiebaTokenizer when the freq is set explicitly to 0.
  903. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord1.";
  904. // Create a TextFile dataset
  905. std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
  906. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  907. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  908. std::shared_ptr<Dataset> ds = TextFile({data_file});
  909. EXPECT_NE(ds, nullptr);
  910. // Create jieba_tokenizer operation on ds
  911. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
  912. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  913. EXPECT_NE(jieba_tokenizer, nullptr);
  914. // Add word with freq is set explicitly to 0
  915. ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 0));
  916. // Create Map operation on ds
  917. ds = ds->Map({jieba_tokenizer}, {"text"});
  918. EXPECT_NE(ds, nullptr);
  919. // Create an iterator over the result of the above dataset
  920. // This will trigger the creation of the Execution Tree and launch it.
  921. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  922. EXPECT_NE(iter, nullptr);
  923. // Iterate the dataset and get each row
  924. std::unordered_map<std::string, mindspore::MSTensor> row;
  925. ASSERT_OK(iter->GetNextRow(&row));
  926. std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
  927. std::shared_ptr<Tensor> de_expected_tensor;
  928. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  929. mindspore::MSTensor expected_tensor =
  930. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  931. uint64_t i = 0;
  932. while (row.size() != 0) {
  933. auto ind = row["text"];
  934. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  935. ASSERT_OK(iter->GetNextRow(&row));
  936. i++;
  937. }
  938. EXPECT_EQ(i, 1);
  939. // Manually terminate the pipeline
  940. iter->Stop();
  941. }
  942. TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) {
  943. // Testing the parameter AddWord of JiebaTokenizer when the freq is 10.
  944. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord2.";
  945. // Create a TextFile dataset
  946. std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
  947. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  948. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  949. std::shared_ptr<Dataset> ds = TextFile({data_file});
  950. EXPECT_NE(ds, nullptr);
  951. // Create jieba_tokenizer operation on ds
  952. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
  953. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  954. EXPECT_NE(jieba_tokenizer, nullptr);
  955. // Add word with freq 10
  956. ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 10));
  957. // Create Map operation on ds
  958. ds = ds->Map({jieba_tokenizer}, {"text"});
  959. EXPECT_NE(ds, nullptr);
  960. // Create an iterator over the result of the above dataset
  961. // This will trigger the creation of the Execution Tree and launch it.
  962. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  963. EXPECT_NE(iter, nullptr);
  964. // Iterate the dataset and get each row
  965. std::unordered_map<std::string, mindspore::MSTensor> row;
  966. ASSERT_OK(iter->GetNextRow(&row));
  967. std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
  968. std::shared_ptr<Tensor> de_expected_tensor;
  969. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  970. mindspore::MSTensor expected_tensor =
  971. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  972. uint64_t i = 0;
  973. while (row.size() != 0) {
  974. auto ind = row["text"];
  975. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  976. ASSERT_OK(iter->GetNextRow(&row));
  977. i++;
  978. }
  979. EXPECT_EQ(i, 1);
  980. // Manually terminate the pipeline
  981. iter->Stop();
  982. }
  983. TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) {
  984. // Testing the parameter AddWord of JiebaTokenizer when the freq is 20000 which affects the result of segmentation.
  985. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord3.";
  986. // Create a TextFile dataset
  987. std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
  988. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  989. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  990. std::shared_ptr<Dataset> ds = TextFile({data_file});
  991. EXPECT_NE(ds, nullptr);
  992. // Create jieba_tokenizer operation on ds
  993. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
  994. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  995. EXPECT_NE(jieba_tokenizer, nullptr);
  996. // Add word with freq 20000
  997. ASSERT_OK(jieba_tokenizer->AddWord("江大桥", 20000));
  998. // Create Map operation on ds
  999. ds = ds->Map({jieba_tokenizer}, {"text"});
  1000. EXPECT_NE(ds, nullptr);
  1001. // Create an iterator over the result of the above dataset
  1002. // This will trigger the creation of the Execution Tree and launch it.
  1003. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1004. EXPECT_NE(iter, nullptr);
  1005. // Iterate the dataset and get each row
  1006. std::unordered_map<std::string, mindspore::MSTensor> row;
  1007. ASSERT_OK(iter->GetNextRow(&row));
  1008. std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"};
  1009. std::shared_ptr<Tensor> de_expected_tensor;
  1010. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  1011. mindspore::MSTensor expected_tensor =
  1012. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1013. uint64_t i = 0;
  1014. while (row.size() != 0) {
  1015. auto ind = row["text"];
  1016. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  1017. ASSERT_OK(iter->GetNextRow(&row));
  1018. i++;
  1019. }
  1020. EXPECT_EQ(i, 1);
  1021. // Manually terminate the pipeline
  1022. iter->Stop();
  1023. }
  1024. TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) {
  1025. // Testing the incorrect parameter of AddWord in JiebaTokenizer.
  1026. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWordFail.";
  1027. // Create a TextFile dataset
  1028. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  1029. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  1030. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  1031. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1032. EXPECT_NE(ds, nullptr);
  1033. // Testing the parameter word of AddWord is empty
  1034. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
  1035. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  1036. EXPECT_NE(jieba_tokenizer, nullptr);
  1037. EXPECT_NE(jieba_tokenizer->AddWord("", 10), Status::OK());
  1038. // Testing the parameter freq of AddWord is negative
  1039. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer1 =
  1040. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  1041. EXPECT_NE(jieba_tokenizer1, nullptr);
  1042. EXPECT_NE(jieba_tokenizer1->AddWord("我们", -1), Status::OK());
  1043. }
  1044. TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDict) {
  1045. // Testing AddDict of JiebaTokenizer when the input is a vector of word-freq pair.
  1046. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDict.";
  1047. // Create a TextFile dataset
  1048. std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
  1049. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  1050. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  1051. std::shared_ptr<Dataset> ds = TextFile({data_file});
  1052. EXPECT_NE(ds, nullptr);
  1053. // Create jieba_tokenizer operation on ds
  1054. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
  1055. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  1056. EXPECT_NE(jieba_tokenizer, nullptr);
  1057. // Add word with freq 20000
  1058. std::vector<std::pair<std::string, int64_t>> user_dict = {{"江大桥", 20000}};
  1059. ASSERT_OK(jieba_tokenizer->AddDict(user_dict));
  1060. // Create Map operation on ds
  1061. ds = ds->Map({jieba_tokenizer}, {"text"});
  1062. EXPECT_NE(ds, nullptr);
  1063. // Create an iterator over the result of the above dataset
  1064. // This will trigger the creation of the Execution Tree and launch it.
  1065. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1066. EXPECT_NE(iter, nullptr);
  1067. // Iterate the dataset and get each row
  1068. std::unordered_map<std::string, mindspore::MSTensor> row;
  1069. ASSERT_OK(iter->GetNextRow(&row));
  1070. std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"};
  1071. std::shared_ptr<Tensor> de_expected_tensor;
  1072. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  1073. mindspore::MSTensor expected_tensor =
  1074. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1075. uint64_t i = 0;
  1076. while (row.size() != 0) {
  1077. auto txt = row["text"];
  1078. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  1079. ASSERT_OK(iter->GetNextRow(&row));
  1080. i++;
  1081. }
  1082. EXPECT_EQ(i, 1);
  1083. // Manually terminate the pipeline
  1084. iter->Stop();
  1085. }
  1086. TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDictFromFile) {
  1087. // Testing AddDict of JiebaTokenizer when the input is a path to dict.
  1088. // Test error scenario for AddDict: invalid path
  1089. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDictFromFile.";
  1090. // Create a TextFile dataset
  1091. std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  1092. std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  1093. std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  1094. std::shared_ptr<Dataset> ds = TextFile({data_file});
  1095. EXPECT_NE(ds, nullptr);
  1096. // Create jieba_tokenizer operation on ds
  1097. std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
  1098. std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  1099. EXPECT_NE(jieba_tokenizer, nullptr);
  1100. // Load dict from txt file
  1101. std::string user_dict_path = datasets_root_path_ + "/testJiebaDataset/user_dict.txt";
  1102. std::string invalid_path = datasets_root_path_ + "/testJiebaDataset/invalid_path.txt";
  1103. EXPECT_ERROR(jieba_tokenizer->AddDict(invalid_path));
  1104. ASSERT_OK(jieba_tokenizer->AddDict(user_dict_path));
  1105. // Create Map operation on ds
  1106. ds = ds->Map({jieba_tokenizer}, {"text"});
  1107. EXPECT_NE(ds, nullptr);
  1108. // Create an iterator over the result of the above dataset
  1109. // This will trigger the creation of the Execution Tree and launch it.
  1110. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1111. EXPECT_NE(iter, nullptr);
  1112. // Iterate the dataset and get each row
  1113. std::unordered_map<std::string, mindspore::MSTensor> row;
  1114. ASSERT_OK(iter->GetNextRow(&row));
  1115. std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
  1116. std::shared_ptr<Tensor> de_expected_tensor;
  1117. ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
  1118. mindspore::MSTensor expected_tensor =
  1119. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1120. uint64_t i = 0;
  1121. while (row.size() != 0) {
  1122. auto txt = row["text"];
  1123. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  1124. ASSERT_OK(iter->GetNextRow(&row));
  1125. i++;
  1126. }
  1127. EXPECT_EQ(i, 1);
  1128. // Manually terminate the pipeline
  1129. iter->Stop();
  1130. }
  1131. TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) {
  1132. // Testing the parameter of SlidingWindow interface when the axis is 0.
  1133. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess.";
  1134. // Create a TextFile dataset
  1135. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1136. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1137. EXPECT_NE(ds, nullptr);
  1138. // Create white_tokenizer operation on ds
  1139. std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  1140. EXPECT_NE(white_tokenizer, nullptr);
  1141. // Create sliding_window operation on ds
  1142. std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(3, 0);
  1143. EXPECT_NE(sliding_window, nullptr);
  1144. // Create Map operation on ds
  1145. ds = ds->Map({white_tokenizer, sliding_window}, {"text"});
  1146. EXPECT_NE(ds, nullptr);
  1147. // Create an iterator over the result of the above dataset
  1148. // This will trigger the creation of the Execution Tree and launch it.
  1149. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1150. EXPECT_NE(iter, nullptr);
  1151. // Iterate the dataset and get each row
  1152. std::unordered_map<std::string, mindspore::MSTensor> row;
  1153. ASSERT_OK(iter->GetNextRow(&row));
  1154. std::vector<std::vector<std::string>> expected = {{"This", "is", "a", "is", "a", "text", "a", "text", "file."},
  1155. {"Be", "happy", "every", "happy", "every", "day."},
  1156. {"Good", "luck", "to", "luck", "to", "everyone."}};
  1157. uint64_t i = 0;
  1158. while (row.size() != 0) {
  1159. auto ind = row["text"];
  1160. std::shared_ptr<Tensor> de_expected_tensor;
  1161. int x = expected[i].size() / 3;
  1162. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x, 3}), &de_expected_tensor));
  1163. mindspore::MSTensor expected_tensor =
  1164. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1165. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  1166. ASSERT_OK(iter->GetNextRow(&row));
  1167. i++;
  1168. }
  1169. EXPECT_EQ(i, 3);
  1170. // Manually terminate the pipeline
  1171. iter->Stop();
  1172. }
  1173. TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess1) {
  1174. // Testing the parameter of SlidingWindow interface when the axis is -1.
  1175. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess1.";
  1176. // Create a TextFile dataset
  1177. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1178. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1179. EXPECT_NE(ds, nullptr);
  1180. // Create white_tokenizer operation on ds
  1181. std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  1182. EXPECT_NE(white_tokenizer, nullptr);
  1183. // Create sliding_window operation on ds
  1184. std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(2, -1);
  1185. EXPECT_NE(sliding_window, nullptr);
  1186. // Create Map operation on ds
  1187. ds = ds->Map({white_tokenizer, sliding_window}, {"text"});
  1188. EXPECT_NE(ds, nullptr);
  1189. // Create an iterator over the result of the above dataset
  1190. // This will trigger the creation of the Execution Tree and launch it.
  1191. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1192. EXPECT_NE(iter, nullptr);
  1193. // Iterate the dataset and get each row
  1194. std::unordered_map<std::string, mindspore::MSTensor> row;
  1195. ASSERT_OK(iter->GetNextRow(&row));
  1196. std::vector<std::vector<std::string>> expected = {{"This", "is", "is", "a", "a", "text", "text", "file."},
  1197. {"Be", "happy", "happy", "every", "every", "day."},
  1198. {"Good", "luck", "luck", "to", "to", "everyone."}};
  1199. uint64_t i = 0;
  1200. while (row.size() != 0) {
  1201. auto ind = row["text"];
  1202. std::shared_ptr<Tensor> de_expected_tensor;
  1203. int x = expected[i].size() / 2;
  1204. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x, 2}), &de_expected_tensor));
  1205. mindspore::MSTensor expected_tensor =
  1206. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1207. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  1208. ASSERT_OK(iter->GetNextRow(&row));
  1209. i++;
  1210. }
  1211. EXPECT_EQ(i, 3);
  1212. // Manually terminate the pipeline
  1213. iter->Stop();
  1214. }
  1215. TEST_F(MindDataTestPipeline, TestSlidingWindowFail1) {
  1216. // Testing the incorrect parameter of SlidingWindow interface.
  1217. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail1.";
  1218. // Create a TextFile dataset
  1219. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1220. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1221. EXPECT_NE(ds, nullptr);
  1222. // Create sliding_window operation on ds
  1223. // Testing the parameter width less than or equal to 0
  1224. // The parameter axis support 0 or -1 only for now
  1225. std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(0, 0);
  1226. EXPECT_NE(sliding_window, nullptr);
  1227. // Create a Map operation on ds
  1228. ds = ds->Map({sliding_window});
  1229. EXPECT_NE(ds, nullptr);
  1230. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1231. // Expect failure: invalid SlidingWindow input (width less than or equal to 0)
  1232. EXPECT_EQ(iter, nullptr);
  1233. }
  1234. TEST_F(MindDataTestPipeline, TestSlidingWindowFail2) {
  1235. // Testing the incorrect parameter of SlidingWindow interface.
  1236. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail2.";
  1237. // Create a TextFile dataset
  1238. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1239. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1240. EXPECT_NE(ds, nullptr);
  1241. // Create sliding_window operation on ds
  1242. // Testing the parameter width less than or equal to 0
  1243. // The parameter axis support 0 or -1 only for now
  1244. std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(-2, 0);
  1245. EXPECT_NE(sliding_window, nullptr);
  1246. // Create a Map operation on ds
  1247. ds = ds->Map({sliding_window});
  1248. EXPECT_NE(ds, nullptr);
  1249. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1250. // Expect failure: invalid SlidingWindow input (width less than or equal to 0)
  1251. EXPECT_EQ(iter, nullptr);
  1252. }
  1253. TEST_F(MindDataTestPipeline, TestToNumberSuccess1) {
  1254. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberSuccess1.";
  1255. // Test ToNumber with integer numbers
  1256. std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
  1257. // Create a TextFile dataset
  1258. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1259. EXPECT_NE(ds, nullptr);
  1260. // Create a Take operation on ds
  1261. ds = ds->Take(8);
  1262. EXPECT_NE(ds, nullptr);
  1263. // Create ToNumber operation on ds
  1264. std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
  1265. EXPECT_NE(to_number, nullptr);
  1266. // Create a Map operation on ds
  1267. ds = ds->Map({to_number}, {"text"});
  1268. EXPECT_NE(ds, nullptr);
  1269. // Create an iterator over the result of the above dataset
  1270. // This will trigger the creation of the Execution Tree and launch it.
  1271. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1272. EXPECT_NE(iter, nullptr);
  1273. // Iterate the dataset and get each row
  1274. std::unordered_map<std::string, mindspore::MSTensor> row;
  1275. ASSERT_OK(iter->GetNextRow(&row));
  1276. std::vector<int64_t> expected = {-121, 14, -2219, 7623, -8162536, 162371864, -1726483716, 98921728421};
  1277. uint64_t i = 0;
  1278. while (row.size() != 0) {
  1279. auto ind = row["text"];
  1280. std::shared_ptr<Tensor> de_expected_tensor;
  1281. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  1282. mindspore::MSTensor ms_expected_tensor =
  1283. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1284. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  1285. ASSERT_OK(iter->GetNextRow(&row));
  1286. i++;
  1287. }
  1288. EXPECT_EQ(i, 8);
  1289. // Manually terminate the pipeline
  1290. iter->Stop();
  1291. }
  1292. TEST_F(MindDataTestPipeline, TestToNumberSuccess2) {
  1293. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberSuccess2.";
  1294. // Test ToNumber with float numbers
  1295. std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
  1296. // Create a TextFile dataset
  1297. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1298. EXPECT_NE(ds, nullptr);
  1299. // Create a Skip operation on ds
  1300. ds = ds->Skip(8);
  1301. EXPECT_NE(ds, nullptr);
  1302. // Create a Take operation on ds
  1303. ds = ds->Take(6);
  1304. EXPECT_NE(ds, nullptr);
  1305. // Create ToNumber operation on ds
  1306. std::shared_ptr<TensorTransform> to_number =
  1307. std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
  1308. EXPECT_NE(to_number, nullptr);
  1309. // Create a Map operation on ds
  1310. ds = ds->Map({to_number}, {"text"});
  1311. EXPECT_NE(ds, nullptr);
  1312. // Create an iterator over the result of the above dataset
  1313. // This will trigger the creation of the Execution Tree and launch it.
  1314. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1315. EXPECT_NE(iter, nullptr);
  1316. // Iterate the dataset and get each row
  1317. std::unordered_map<std::string, mindspore::MSTensor> row;
  1318. ASSERT_OK(iter->GetNextRow(&row));
  1319. std::vector<double_t> expected = {-1.1, 1.4, -2219.321, 7623.453, -816256.234282, 162371864.243243};
  1320. uint64_t i = 0;
  1321. while (row.size() != 0) {
  1322. auto ind = row["text"];
  1323. std::shared_ptr<Tensor> de_expected_tensor;
  1324. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  1325. mindspore::MSTensor ms_expected_tensor =
  1326. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1327. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  1328. ASSERT_OK(iter->GetNextRow(&row));
  1329. i++;
  1330. }
  1331. EXPECT_EQ(i, 6);
  1332. // Manually terminate the pipeline
  1333. iter->Stop();
  1334. }
  1335. TEST_F(MindDataTestPipeline, TestToNumberFail1) {
  1336. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail1.";
  1337. // Test ToNumber with overflow integer numbers
  1338. std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
  1339. // Create a TextFile dataset
  1340. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1341. EXPECT_NE(ds, nullptr);
  1342. // Create a Skip operation on ds
  1343. ds = ds->Skip(2);
  1344. EXPECT_NE(ds, nullptr);
  1345. // Create a Take operation on ds
  1346. ds = ds->Take(6);
  1347. EXPECT_NE(ds, nullptr);
  1348. // Create ToNumber operation on ds
  1349. std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
  1350. EXPECT_NE(to_number, nullptr);
  1351. // Create a Map operation on ds
  1352. ds = ds->Map({to_number}, {"text"});
  1353. EXPECT_NE(ds, nullptr);
  1354. // Create an iterator over the result of the above dataset
  1355. // This will trigger the creation of the Execution Tree and launch it.
  1356. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1357. EXPECT_NE(iter, nullptr);
  1358. // Iterate the dataset and get each row
  1359. std::unordered_map<std::string, mindspore::MSTensor> row;
  1360. // Expect error: input out of bounds of int8
  1361. EXPECT_ERROR(iter->GetNextRow(&row));
  1362. uint64_t i = 0;
  1363. while (row.size() != 0) {
  1364. EXPECT_ERROR(iter->GetNextRow(&row));
  1365. i++;
  1366. }
  1367. // Expect failure: GetNextRow fail and return nothing
  1368. EXPECT_EQ(i, 0);
  1369. // Manually terminate the pipeline
  1370. iter->Stop();
  1371. }
  1372. TEST_F(MindDataTestPipeline, TestToNumberFail2) {
  1373. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail2.";
  1374. // Test ToNumber with overflow float numbers
  1375. std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
  1376. // Create a TextFile dataset
  1377. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1378. EXPECT_NE(ds, nullptr);
  1379. // Create a Skip operation on ds
  1380. ds = ds->Skip(12);
  1381. EXPECT_NE(ds, nullptr);
  1382. // Create a Take operation on ds
  1383. ds = ds->Take(2);
  1384. EXPECT_NE(ds, nullptr);
  1385. // Create ToNumber operation on ds
  1386. std::shared_ptr<TensorTransform> to_number =
  1387. std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
  1388. EXPECT_NE(to_number, nullptr);
  1389. // Create a Map operation on ds
  1390. ds = ds->Map({to_number}, {"text"});
  1391. EXPECT_NE(ds, nullptr);
  1392. // Create an iterator over the result of the above dataset
  1393. // This will trigger the creation of the Execution Tree and launch it.
  1394. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1395. EXPECT_NE(iter, nullptr);
  1396. // Iterate the dataset and get each row
  1397. std::unordered_map<std::string, mindspore::MSTensor> row;
  1398. // Expect error: input out of bounds of float16
  1399. EXPECT_ERROR(iter->GetNextRow(&row));
  1400. uint64_t i = 0;
  1401. while (row.size() != 0) {
  1402. EXPECT_ERROR(iter->GetNextRow(&row));
  1403. i++;
  1404. }
  1405. // Expect failure: GetNextRow fail and return nothing
  1406. EXPECT_EQ(i, 0);
  1407. // Manually terminate the pipeline
  1408. iter->Stop();
  1409. }
  1410. TEST_F(MindDataTestPipeline, TestToNumberFail3) {
  1411. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail3.";
  1412. // Test ToNumber with non numerical input
  1413. std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
  1414. // Create a TextFile dataset
  1415. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1416. EXPECT_NE(ds, nullptr);
  1417. // Create a Skip operation on ds
  1418. ds = ds->Skip(14);
  1419. EXPECT_NE(ds, nullptr);
  1420. // Create ToNumber operation on ds
  1421. std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
  1422. EXPECT_NE(to_number, nullptr);
  1423. // Create a Map operation on ds
  1424. ds = ds->Map({to_number}, {"text"});
  1425. EXPECT_NE(ds, nullptr);
  1426. // Create an iterator over the result of the above dataset
  1427. // This will trigger the creation of the Execution Tree and launch it.
  1428. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1429. EXPECT_NE(iter, nullptr);
  1430. // Iterate the dataset and get each row
  1431. std::unordered_map<std::string, mindspore::MSTensor> row;
  1432. // Expect error: invalid input which is non numerical
  1433. EXPECT_ERROR(iter->GetNextRow(&row));
  1434. uint64_t i = 0;
  1435. while (row.size() != 0) {
  1436. EXPECT_ERROR(iter->GetNextRow(&row));
  1437. i++;
  1438. }
  1439. // Expect failure: GetNextRow fail and return nothing
  1440. EXPECT_EQ(i, 0);
  1441. // Manually terminate the pipeline
  1442. iter->Stop();
  1443. }
  1444. TEST_F(MindDataTestPipeline, TestToNumberFail4) {
  1445. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail4.";
  1446. // Test ToNumber with non numerical data type
  1447. std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
  1448. // Create a TextFile dataset
  1449. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1450. EXPECT_NE(ds, nullptr);
  1451. // Create ToNumber operation on ds
  1452. std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kObjectTypeString);
  1453. EXPECT_NE(to_number, nullptr);
  1454. // Create a Map operation on ds
  1455. ds = ds->Map({to_number}, {"text"});
  1456. EXPECT_NE(ds, nullptr);
  1457. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1458. // Expect failure: invalid parameter with non numerical data type
  1459. EXPECT_EQ(iter, nullptr);
  1460. }
  1461. TEST_F(MindDataTestPipeline, TestToNumberFail5) {
  1462. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail5.";
  1463. // Test ToNumber with non numerical data type
  1464. std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
  1465. // Create a TextFile dataset
  1466. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1467. EXPECT_NE(ds, nullptr);
  1468. // Create ToNumber operation on ds
  1469. std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeBool);
  1470. EXPECT_NE(to_number, nullptr);
  1471. // Create a Map operation on ds
  1472. ds = ds->Map({to_number}, {"text"});
  1473. EXPECT_NE(ds, nullptr);
  1474. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1475. // Expect failure: invalid parameter with non numerical data type
  1476. EXPECT_EQ(iter, nullptr);
  1477. }
  1478. TEST_F(MindDataTestPipeline, TestTruncateSequencePairSuccess1) {
  1479. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairSuccess1.";
  1480. // Testing basic TruncateSequencePair
  1481. // Set seed for RandomDataset
  1482. auto original_seed = config::get_seed();
  1483. bool status_set_seed = config::set_seed(0);
  1484. EXPECT_EQ(status_set_seed, true);
  1485. // Set num_parallel_workers for RandomDataset
  1486. auto original_worker = config::get_num_parallel_workers();
  1487. bool status_set_worker = config::set_num_parallel_workers(1);
  1488. EXPECT_EQ(status_set_worker, true);
  1489. // Create a RandomDataset which has column names "col1" and "col2"
  1490. std::shared_ptr<SchemaObj> schema = Schema();
  1491. ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt16, {5}));
  1492. ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt32, {3}));
  1493. std::shared_ptr<Dataset> ds = RandomData(3, schema);
  1494. EXPECT_NE(ds, nullptr);
  1495. // Create a truncate_sequence_pair operation on ds
  1496. std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(4);
  1497. EXPECT_NE(truncate_sequence_pair, nullptr);
  1498. // Create Map operation on ds
  1499. ds = ds->Map({truncate_sequence_pair}, {"col1", "col2"});
  1500. EXPECT_NE(ds, nullptr);
  1501. // Create an iterator over the result of the above dataset
  1502. // This will trigger the creation of the Execution Tree and launch it.
  1503. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1504. EXPECT_NE(iter, nullptr);
  1505. // Iterate the dataset and get each row
  1506. std::unordered_map<std::string, mindspore::MSTensor> row;
  1507. ASSERT_OK(iter->GetNextRow(&row));
  1508. std::vector<std::vector<int16_t>> expected1 = {{-29556, -29556}, {-18505, -18505}, {-25958, -25958}};
  1509. std::vector<std::vector<int32_t>> expected2 = {
  1510. {-1751672937, -1751672937}, {-656877352, -656877352}, {-606348325, -606348325}};
  1511. uint64_t i = 0;
  1512. while (row.size() != 0) {
  1513. auto ind1 = row["col1"];
  1514. auto ind2 = row["col2"];
  1515. std::shared_ptr<Tensor> de_expected_tensor1;
  1516. ASSERT_OK(Tensor::CreateFromVector(expected1[i], &de_expected_tensor1));
  1517. mindspore::MSTensor expected_tensor1 =
  1518. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor1));
  1519. EXPECT_MSTENSOR_EQ(ind1, expected_tensor1);
  1520. std::shared_ptr<Tensor> de_expected_tensor2;
  1521. ASSERT_OK(Tensor::CreateFromVector(expected2[i], &de_expected_tensor2));
  1522. mindspore::MSTensor expected_tensor2 =
  1523. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor2));
  1524. EXPECT_MSTENSOR_EQ(ind2, expected_tensor2);
  1525. ASSERT_OK(iter->GetNextRow(&row));
  1526. i++;
  1527. }
  1528. EXPECT_EQ(i, 3);
  1529. // Manually terminate the pipeline
  1530. iter->Stop();
  1531. // Restore original seed and num_parallel_workers
  1532. status_set_seed = config::set_seed(original_seed);
  1533. EXPECT_EQ(status_set_seed, true);
  1534. status_set_worker = config::set_num_parallel_workers(original_worker);
  1535. EXPECT_EQ(status_set_worker, true);
  1536. }
  1537. TEST_F(MindDataTestPipeline, TestTruncateSequencePairSuccess2) {
  1538. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairSuccess2.";
  1539. // Testing basic TruncateSequencePair with odd max_length
  1540. // Set seed for RandomDataset
  1541. auto original_seed = config::get_seed();
  1542. bool status_set_seed = config::set_seed(1);
  1543. EXPECT_EQ(status_set_seed, true);
  1544. // Set num_parallel_workers for RandomDataset
  1545. auto original_worker = config::get_num_parallel_workers();
  1546. bool status_set_worker = config::set_num_parallel_workers(1);
  1547. EXPECT_EQ(status_set_worker, true);
  1548. // Create a RandomDataset which has column names "col1" and "col2"
  1549. std::shared_ptr<SchemaObj> schema = Schema();
  1550. ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt32, {4}));
  1551. ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt64, {4}));
  1552. std::shared_ptr<Dataset> ds = RandomData(4, schema);
  1553. EXPECT_NE(ds, nullptr);
  1554. // Create a truncate_sequence_pair operation on ds
  1555. std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(5);
  1556. EXPECT_NE(truncate_sequence_pair, nullptr);
  1557. // Create Map operation on ds
  1558. ds = ds->Map({truncate_sequence_pair}, {"col1", "col2"});
  1559. EXPECT_NE(ds, nullptr);
  1560. // Create an iterator over the result of the above dataset
  1561. // This will trigger the creation of the Execution Tree and launch it.
  1562. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1563. EXPECT_NE(iter, nullptr);
  1564. // Iterate the dataset and get each row
  1565. std::unordered_map<std::string, mindspore::MSTensor> row;
  1566. ASSERT_OK(iter->GetNextRow(&row));
  1567. std::vector<std::vector<int32_t>> expected1 = {{1785358954, 1785358954, 1785358954},
  1568. {-1195853640, -1195853640, -1195853640},
  1569. {0, 0, 0},
  1570. {1296911693, 1296911693, 1296911693}};
  1571. std::vector<std::vector<int64_t>> expected2 = {
  1572. {-1, -1}, {-1229782938247303442, -1229782938247303442}, {2314885530818453536, 2314885530818453536}, {-1, -1}};
  1573. uint64_t i = 0;
  1574. while (row.size() != 0) {
  1575. auto ind1 = row["col1"];
  1576. auto ind2 = row["col2"];
  1577. std::shared_ptr<Tensor> de_expected_tensor1;
  1578. ASSERT_OK(Tensor::CreateFromVector(expected1[i], &de_expected_tensor1));
  1579. mindspore::MSTensor expected_tensor1 =
  1580. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor1));
  1581. EXPECT_MSTENSOR_EQ(ind1, expected_tensor1);
  1582. std::shared_ptr<Tensor> de_expected_tensor2;
  1583. ASSERT_OK(Tensor::CreateFromVector(expected2[i], &de_expected_tensor2));
  1584. mindspore::MSTensor expected_tensor2 =
  1585. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor2));
  1586. EXPECT_MSTENSOR_EQ(ind2, expected_tensor2);
  1587. ASSERT_OK(iter->GetNextRow(&row));
  1588. i++;
  1589. }
  1590. EXPECT_EQ(i, 4);
  1591. // Manually terminate the pipeline
  1592. iter->Stop();
  1593. // Restore original seed and num_parallel_workers
  1594. status_set_seed = config::set_seed(original_seed);
  1595. EXPECT_EQ(status_set_seed, true);
  1596. status_set_worker = config::set_num_parallel_workers(original_worker);
  1597. EXPECT_EQ(status_set_worker, true);
  1598. }
  1599. TEST_F(MindDataTestPipeline, TestTruncateSequencePairFail) {
  1600. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairFail.";
  1601. // Testing TruncateSequencePair with negative max_length
  1602. // Create a RandomDataset which has column names "col1" and "col2"
  1603. std::shared_ptr<SchemaObj> schema = Schema();
  1604. ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt8, {3}));
  1605. ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt8, {3}));
  1606. std::shared_ptr<Dataset> ds = RandomData(3, schema);
  1607. EXPECT_NE(ds, nullptr);
  1608. // Create a truncate_sequence_pair operation on ds
  1609. std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(-1);
  1610. EXPECT_NE(truncate_sequence_pair, nullptr);
  1611. // Create a Map operation on ds
  1612. ds = ds->Map({truncate_sequence_pair});
  1613. EXPECT_NE(ds, nullptr);
  1614. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1615. // Expect failure: invalid TruncateSequencePair input (invalid parameter with negative max_length)
  1616. EXPECT_EQ(iter, nullptr);
  1617. }
  1618. TEST_F(MindDataTestPipeline, TestNgramSuccess) {
  1619. // Testing the parameter of Ngram interface.
  1620. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess.";
  1621. // Create a TextFile dataset
  1622. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1623. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1624. EXPECT_NE(ds, nullptr);
  1625. // Create white_tokenizer operation on ds
  1626. std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  1627. EXPECT_NE(white_tokenizer, nullptr);
  1628. // Create sliding_window operation on ds
  1629. std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2}, {"_", 1}, {"_", 1}, " "));
  1630. EXPECT_NE(ngram_op, nullptr);
  1631. // Create Map operation on ds
  1632. ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
  1633. EXPECT_NE(ds, nullptr);
  1634. // Create an iterator over the result of the above dataset
  1635. // This will trigger the creation of the Execution Tree and launch it.
  1636. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1637. EXPECT_NE(iter, nullptr);
  1638. // Iterate the dataset and get each row
  1639. std::unordered_map<std::string, mindspore::MSTensor> row;
  1640. ASSERT_OK(iter->GetNextRow(&row));
  1641. std::vector<std::vector<std::string>> expected = {{"_ This", "This is", "is a", "a text", "text file.", "file. _"},
  1642. {"_ Be", "Be happy", "happy every", "every day.", "day. _"},
  1643. {"_ Good", "Good luck", "luck to", "to everyone.", "everyone. _"}};
  1644. uint64_t i = 0;
  1645. while (row.size() != 0) {
  1646. auto ind = row["text"];
  1647. std::shared_ptr<Tensor> de_expected_tensor;
  1648. int x = expected[i].size();
  1649. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
  1650. mindspore::MSTensor expected_tensor =
  1651. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1652. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  1653. ASSERT_OK(iter->GetNextRow(&row));
  1654. i++;
  1655. }
  1656. EXPECT_EQ(i, 3);
  1657. // Manually terminate the pipeline
  1658. iter->Stop();
  1659. }
  1660. TEST_F(MindDataTestPipeline, TestNgramSuccess1) {
  1661. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess1.";
  1662. // Create a TextFile dataset
  1663. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1664. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1665. EXPECT_NE(ds, nullptr);
  1666. // Create white_tokenizer operation on ds
  1667. std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  1668. EXPECT_NE(white_tokenizer, nullptr);
  1669. // Create sliding_window operation on ds
  1670. std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2, 3}, {"&", 2}, {"&", 2}, "-"));
  1671. EXPECT_NE(ngram_op, nullptr);
  1672. // Create Map operation on ds
  1673. ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
  1674. EXPECT_NE(ds, nullptr);
  1675. // Create an iterator over the result of the above dataset
  1676. // This will trigger the creation of the Execution Tree and launch it.
  1677. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1678. EXPECT_NE(iter, nullptr);
  1679. // Iterate the dataset and get each row
  1680. std::unordered_map<std::string, mindspore::MSTensor> row;
  1681. ASSERT_OK(iter->GetNextRow(&row));
  1682. std::vector<std::vector<std::string>> expected = {
  1683. {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a", "is-a-text",
  1684. "a-text-file.", "text-file.-&", "file.-&-&"},
  1685. {"&-Be", "Be-happy", "happy-every", "every-day.", "day.-&", "&-&-Be", "&-Be-happy", "Be-happy-every",
  1686. "happy-every-day.", "every-day.-&", "day.-&-&"},
  1687. {"&-Good", "Good-luck", "luck-to", "to-everyone.", "everyone.-&", "&-&-Good", "&-Good-luck", "Good-luck-to",
  1688. "luck-to-everyone.", "to-everyone.-&", "everyone.-&-&"}};
  1689. uint64_t i = 0;
  1690. while (row.size() != 0) {
  1691. auto ind = row["text"];
  1692. std::shared_ptr<Tensor> de_expected_tensor;
  1693. int x = expected[i].size();
  1694. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
  1695. mindspore::MSTensor expected_tensor =
  1696. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1697. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  1698. ASSERT_OK(iter->GetNextRow(&row));
  1699. i++;
  1700. }
  1701. EXPECT_EQ(i, 3);
  1702. // Manually terminate the pipeline
  1703. iter->Stop();
  1704. }
  1705. TEST_F(MindDataTestPipeline, TestNgramFail1) {
  1706. // Testing the incorrect parameter of Ngram interface.
  1707. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail1.";
  1708. // Create a TextFile dataset
  1709. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1710. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1711. EXPECT_NE(ds, nullptr);
  1712. // Create sliding_window operation on ds
  1713. // Testing the vector of ngram is empty
  1714. std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({}));
  1715. EXPECT_NE(ngram_op, nullptr);
  1716. // Create a Map operation on ds
  1717. ds = ds->Map({ngram_op});
  1718. EXPECT_NE(ds, nullptr);
  1719. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1720. // Expect failure: invalid Ngram input (the vector of ngram is empty)
  1721. EXPECT_EQ(iter, nullptr);
  1722. }
  1723. TEST_F(MindDataTestPipeline, TestNgramFail2) {
  1724. // Testing the incorrect parameter of Ngram interface.
  1725. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail2.";
  1726. // Create a TextFile dataset
  1727. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1728. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1729. EXPECT_NE(ds, nullptr);
  1730. // Create sliding_window operation on ds
  1731. // Testing the value of ngrams vector less than and equal to 0
  1732. std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({0}));
  1733. EXPECT_NE(ngram_op, nullptr);
  1734. // Create a Map operation on ds
  1735. ds = ds->Map({ngram_op});
  1736. EXPECT_NE(ds, nullptr);
  1737. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1738. // Expect failure: invalid Ngram input (the value of ngrams vector less than and equal to 0)
  1739. EXPECT_EQ(iter, nullptr);
  1740. }
  1741. TEST_F(MindDataTestPipeline, TestNgramFail3) {
  1742. // Testing the incorrect parameter of Ngram interface.
  1743. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail3.";
  1744. // Create a TextFile dataset
  1745. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1746. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1747. EXPECT_NE(ds, nullptr);
  1748. // Create sliding_window operation on ds
  1749. // Testing the value of ngrams vector less than and equal to 0
  1750. std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({-2}));
  1751. EXPECT_NE(ngram_op, nullptr);
  1752. // Create a Map operation on ds
  1753. ds = ds->Map({ngram_op});
  1754. EXPECT_NE(ds, nullptr);
  1755. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1756. // Expect failure: invalid Ngram input (the value of ngrams vector less than and equal to 0)
  1757. EXPECT_EQ(iter, nullptr);
  1758. }
  1759. TEST_F(MindDataTestPipeline, TestNgramFail4) {
  1760. // Testing the incorrect parameter of Ngram interface.
  1761. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail4.";
  1762. // Create a TextFile dataset
  1763. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1764. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1765. EXPECT_NE(ds, nullptr);
  1766. // Create sliding_window operation on ds
  1767. // Testing the second parameter pad_width in left_pad vector less than 0
  1768. std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2}, {"", -1}));
  1769. EXPECT_NE(ngram_op, nullptr);
  1770. // Create a Map operation on ds
  1771. ds = ds->Map({ngram_op});
  1772. EXPECT_NE(ds, nullptr);
  1773. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1774. // Expect failure: invalid Ngram input (the second parameter pad_width in left_pad vector less than 0)
  1775. EXPECT_EQ(iter, nullptr);
  1776. }
  1777. TEST_F(MindDataTestPipeline, TestNgramFail5) {
  1778. // Testing the incorrect parameter of Ngram interface.
  1779. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail5.";
  1780. // Create a TextFile dataset
  1781. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  1782. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1783. EXPECT_NE(ds, nullptr);
  1784. // Create sliding_window operation on ds
  1785. // Testing the second parameter pad_width in right_pad vector less than 0
  1786. std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2}, {"", 1}, {"", -1}));
  1787. EXPECT_NE(ngram_op, nullptr);
  1788. // Create a Map operation on ds
  1789. ds = ds->Map({ngram_op});
  1790. EXPECT_NE(ds, nullptr);
  1791. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1792. // Expect failure: invalid Ngram input (the second parameter pad_width in left_pad vector less than 0)
  1793. EXPECT_EQ(iter, nullptr);
  1794. }
  1795. TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success) {
  1796. // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfkc.
  1797. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success.";
  1798. // Create a TextFile dataset
  1799. std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  1800. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1801. EXPECT_NE(ds, nullptr);
  1802. // Create normalizeutf8 operation on ds
  1803. std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfkc);
  1804. EXPECT_NE(normalizeutf8, nullptr);
  1805. // Create Map operation on ds
  1806. ds = ds->Map({normalizeutf8}, {"text"});
  1807. EXPECT_NE(ds, nullptr);
  1808. // Create an iterator over the result of the above dataset
  1809. // This will trigger the creation of the Execution Tree and launch it.
  1810. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1811. EXPECT_NE(iter, nullptr);
  1812. // Iterate the dataset and get each row
  1813. std::unordered_map<std::string, mindspore::MSTensor> row;
  1814. ASSERT_OK(iter->GetNextRow(&row));
  1815. std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};
  1816. uint64_t i = 0;
  1817. while (row.size() != 0) {
  1818. auto ind = row["text"];
  1819. std::shared_ptr<Tensor> de_expected_tensor;
  1820. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  1821. mindspore::MSTensor ms_expected_tensor =
  1822. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1823. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  1824. ASSERT_OK(iter->GetNextRow(&row));
  1825. i++;
  1826. }
  1827. EXPECT_EQ(i, 6);
  1828. // Manually terminate the pipeline
  1829. iter->Stop();
  1830. }
  1831. TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success1) {
  1832. // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfc.
  1833. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success1.";
  1834. // Create a TextFile dataset
  1835. std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  1836. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1837. EXPECT_NE(ds, nullptr);
  1838. // Create normalizeutf8 operation on ds
  1839. std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfc);
  1840. EXPECT_NE(normalizeutf8, nullptr);
  1841. // Create Map operation on ds
  1842. ds = ds->Map({normalizeutf8}, {"text"});
  1843. EXPECT_NE(ds, nullptr);
  1844. // Create an iterator over the result of the above dataset
  1845. // This will trigger the creation of the Execution Tree and launch it.
  1846. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1847. EXPECT_NE(iter, nullptr);
  1848. // Iterate the dataset and get each row
  1849. std::unordered_map<std::string, mindspore::MSTensor> row;
  1850. ASSERT_OK(iter->GetNextRow(&row));
  1851. std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};
  1852. uint64_t i = 0;
  1853. while (row.size() != 0) {
  1854. auto ind = row["text"];
  1855. std::shared_ptr<Tensor> de_expected_tensor;
  1856. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  1857. mindspore::MSTensor ms_expected_tensor =
  1858. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1859. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  1860. ASSERT_OK(iter->GetNextRow(&row));
  1861. i++;
  1862. }
  1863. EXPECT_EQ(i, 6);
  1864. // Manually terminate the pipeline
  1865. iter->Stop();
  1866. }
  1867. TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success2) {
  1868. // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfd.
  1869. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success2.";
  1870. // Create a TextFile dataset
  1871. std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  1872. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1873. EXPECT_NE(ds, nullptr);
  1874. // Create normalizeutf8 operation on ds
  1875. std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfd);
  1876. EXPECT_NE(normalizeutf8, nullptr);
  1877. // Create Map operation on ds
  1878. ds = ds->Map({normalizeutf8}, {"text"});
  1879. EXPECT_NE(ds, nullptr);
  1880. // Create an iterator over the result of the above dataset
  1881. // This will trigger the creation of the Execution Tree and launch it.
  1882. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1883. EXPECT_NE(iter, nullptr);
  1884. // Iterate the dataset and get each row
  1885. std::unordered_map<std::string, mindspore::MSTensor> row;
  1886. ASSERT_OK(iter->GetNextRow(&row));
  1887. std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};
  1888. uint64_t i = 0;
  1889. while (row.size() != 0) {
  1890. auto ind = row["text"];
  1891. std::shared_ptr<Tensor> de_expected_tensor;
  1892. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  1893. mindspore::MSTensor ms_expected_tensor =
  1894. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1895. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  1896. ASSERT_OK(iter->GetNextRow(&row));
  1897. i++;
  1898. }
  1899. EXPECT_EQ(i, 6);
  1900. // Manually terminate the pipeline
  1901. iter->Stop();
  1902. }
  1903. TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success3) {
  1904. // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfkd.
  1905. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success3.";
  1906. // Create a TextFile dataset
  1907. std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  1908. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1909. EXPECT_NE(ds, nullptr);
  1910. // Create normalizeutf8 operation on ds
  1911. std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfkd);
  1912. EXPECT_NE(normalizeutf8, nullptr);
  1913. // Create Map operation on ds
  1914. ds = ds->Map({normalizeutf8}, {"text"});
  1915. EXPECT_NE(ds, nullptr);
  1916. // Create an iterator over the result of the above dataset
  1917. // This will trigger the creation of the Execution Tree and launch it.
  1918. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1919. EXPECT_NE(iter, nullptr);
  1920. // Iterate the dataset and get each row
  1921. std::unordered_map<std::string, mindspore::MSTensor> row;
  1922. ASSERT_OK(iter->GetNextRow(&row));
  1923. std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};
  1924. uint64_t i = 0;
  1925. while (row.size() != 0) {
  1926. auto ind = row["text"];
  1927. std::shared_ptr<Tensor> de_expected_tensor;
  1928. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  1929. mindspore::MSTensor ms_expected_tensor =
  1930. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1931. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  1932. ASSERT_OK(iter->GetNextRow(&row));
  1933. i++;
  1934. }
  1935. EXPECT_EQ(i, 6);
  1936. // Manually terminate the pipeline
  1937. iter->Stop();
  1938. }
  1939. TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess) {
  1940. // Testing the parameter of RegexReplace interface when the replace_all is true.
  1941. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess.";
  1942. // Create a TextFile dataset
  1943. std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  1944. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1945. EXPECT_NE(ds, nullptr);
  1946. // Create regex_replace operation on ds
  1947. std::shared_ptr<TensorTransform> regex_replace = std::make_shared<text::RegexReplace>("\\s+", "_", true);
  1948. EXPECT_NE(regex_replace, nullptr);
  1949. // Create Map operation on ds
  1950. ds = ds->Map({regex_replace}, {"text"});
  1951. EXPECT_NE(ds, nullptr);
  1952. // Create an iterator over the result of the above dataset
  1953. // This will trigger the creation of the Execution Tree and launch it.
  1954. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1955. EXPECT_NE(iter, nullptr);
  1956. // Iterate the dataset and get each row
  1957. std::unordered_map<std::string, mindspore::MSTensor> row;
  1958. ASSERT_OK(iter->GetNextRow(&row));
  1959. std::vector<std::string> expected = {"Hello_World", "Let's_Go", "1:hello", "2:world",
  1960. "31:beijing", "Welcome_to_China!", "_我_不想_长大_", "Welcome_to_Shenzhen!"};
  1961. uint64_t i = 0;
  1962. while (row.size() != 0) {
  1963. auto ind = row["text"];
  1964. std::shared_ptr<Tensor> de_expected_tensor;
  1965. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  1966. mindspore::MSTensor ms_expected_tensor =
  1967. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  1968. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  1969. ASSERT_OK(iter->GetNextRow(&row));
  1970. i++;
  1971. }
  1972. EXPECT_EQ(i, 8);
  1973. // Manually terminate the pipeline
  1974. iter->Stop();
  1975. }
  1976. TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess1) {
  1977. // Testing the parameter of RegexReplace interface when the replace_all is false.
  1978. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess1.";
  1979. // Create a TextFile dataset
  1980. std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  1981. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  1982. EXPECT_NE(ds, nullptr);
  1983. // Create regex_replace operation on ds
  1984. std::shared_ptr<TensorTransform> regex_replace = std::make_shared<text::RegexReplace>("\\s+", "_", false);
  1985. EXPECT_NE(regex_replace, nullptr);
  1986. // Create Map operation on ds
  1987. ds = ds->Map({regex_replace}, {"text"});
  1988. EXPECT_NE(ds, nullptr);
  1989. // Create an iterator over the result of the above dataset
  1990. // This will trigger the creation of the Execution Tree and launch it.
  1991. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  1992. EXPECT_NE(iter, nullptr);
  1993. // Iterate the dataset and get each row
  1994. std::unordered_map<std::string, mindspore::MSTensor> row;
  1995. ASSERT_OK(iter->GetNextRow(&row));
  1996. std::vector<std::string> expected = {"Hello_World", "Let's_Go", "1:hello", "2:world",
  1997. "31:beijing", "Welcome_to China!", "_我 不想 长大 ", "Welcome_to Shenzhen!"};
  1998. uint64_t i = 0;
  1999. while (row.size() != 0) {
  2000. auto ind = row["text"];
  2001. std::shared_ptr<Tensor> de_expected_tensor;
  2002. ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
  2003. mindspore::MSTensor ms_expected_tensor =
  2004. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2005. EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
  2006. ASSERT_OK(iter->GetNextRow(&row));
  2007. i++;
  2008. }
  2009. EXPECT_EQ(i, 8);
  2010. // Manually terminate the pipeline
  2011. iter->Stop();
  2012. }
  2013. TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess) {
  2014. // Testing the parameter of RegexTokenizer interface when the with_offsets is false.
  2015. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess.";
  2016. // Create a TextFile dataset
  2017. std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  2018. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2019. EXPECT_NE(ds, nullptr);
  2020. // Create regex_tokenizer operation on ds
  2021. std::shared_ptr<TensorTransform> regex_tokenizer = std::make_shared<text::RegexTokenizer>("\\s+", "\\s+", false);
  2022. EXPECT_NE(regex_tokenizer, nullptr);
  2023. // Create Map operation on ds
  2024. ds = ds->Map({regex_tokenizer}, {"text"});
  2025. EXPECT_NE(ds, nullptr);
  2026. // Create an iterator over the result of the above dataset
  2027. // This will trigger the creation of the Execution Tree and launch it.
  2028. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2029. EXPECT_NE(iter, nullptr);
  2030. // Iterate the dataset and get each row
  2031. std::unordered_map<std::string, mindspore::MSTensor> row;
  2032. ASSERT_OK(iter->GetNextRow(&row));
  2033. std::vector<std::vector<std::string>> expected = {{"Hello", " ", "World"},
  2034. {"Let's", " ", "Go"},
  2035. {"1:hello"},
  2036. {"2:world"},
  2037. {"31:beijing"},
  2038. {"Welcome", " ", "to", " ", "China!"},
  2039. {" ", "我", " ", "不想", " ", "长大", " "},
  2040. {"Welcome", " ", "to", " ", "Shenzhen!"}};
  2041. uint64_t i = 0;
  2042. while (row.size() != 0) {
  2043. auto ind = row["text"];
  2044. std::shared_ptr<Tensor> de_expected_tensor;
  2045. int x = expected[i].size();
  2046. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
  2047. mindspore::MSTensor expected_tensor =
  2048. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2049. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  2050. ASSERT_OK(iter->GetNextRow(&row));
  2051. i++;
  2052. }
  2053. EXPECT_EQ(i, 8);
  2054. // Manually terminate the pipeline
  2055. iter->Stop();
  2056. }
  2057. TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess1) {
  2058. // Testing the parameter of RegexTokenizer interface when the with_offsets is true.
  2059. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess1.";
  2060. // Create a TextFile dataset
  2061. std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  2062. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2063. EXPECT_NE(ds, nullptr);
  2064. // Create regex_tokenizer operation on ds
  2065. std::shared_ptr<TensorTransform> regex_tokenizer = std::make_shared<text::RegexTokenizer>("\\s+", "\\s+", true);
  2066. EXPECT_NE(regex_tokenizer, nullptr);
  2067. // Create Map operation on ds
  2068. ds = ds->Map({regex_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
  2069. {"token", "offsets_start", "offsets_limit"});
  2070. EXPECT_NE(ds, nullptr);
  2071. // Create an iterator over the result of the above dataset
  2072. // This will trigger the creation of the Execution Tree and launch it.
  2073. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2074. EXPECT_NE(iter, nullptr);
  2075. // Iterate the dataset and get each row
  2076. std::unordered_map<std::string, mindspore::MSTensor> row;
  2077. ASSERT_OK(iter->GetNextRow(&row));
  2078. std::vector<std::vector<std::string>> expected_tokens = {{"Hello", " ", "World"},
  2079. {"Let's", " ", "Go"},
  2080. {"1:hello"},
  2081. {"2:world"},
  2082. {"31:beijing"},
  2083. {"Welcome", " ", "to", " ", "China!"},
  2084. {" ", "我", " ", "不想", " ", "长大", " "},
  2085. {"Welcome", " ", "to", " ", "Shenzhen!"}};
  2086. std::vector<std::vector<uint32_t>> expected_offsets_start = {
  2087. {0, 5, 6}, {0, 5, 6}, {0}, {0}, {0}, {0, 7, 8, 10, 11}, {0, 2, 5, 6, 12, 14, 20}, {0, 7, 8, 10, 11}};
  2088. std::vector<std::vector<uint32_t>> expected_offsets_limit = {
  2089. {5, 6, 11}, {5, 6, 8}, {7}, {7}, {10}, {7, 8, 10, 11, 17}, {2, 5, 6, 12, 14, 20, 21}, {7, 8, 10, 11, 20}};
  2090. uint64_t i = 0;
  2091. while (row.size() != 0) {
  2092. auto token = row["token"];
  2093. auto start = row["offsets_start"];
  2094. auto limit = row["offsets_limit"];
  2095. std::shared_ptr<Tensor> de_expected_tokens;
  2096. int x = expected_tokens[i].size();
  2097. ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
  2098. mindspore::MSTensor ms_expected_tokens =
  2099. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  2100. EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
  2101. std::shared_ptr<Tensor> de_expected_offsets_start;
  2102. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
  2103. mindspore::MSTensor ms_expected_offsets_start =
  2104. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  2105. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  2106. std::shared_ptr<Tensor> de_expected_offsets_limit;
  2107. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
  2108. mindspore::MSTensor ms_expected_offsets_limit =
  2109. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  2110. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  2111. ASSERT_OK(iter->GetNextRow(&row));
  2112. i++;
  2113. }
  2114. EXPECT_EQ(i, 8);
  2115. // Manually terminate the pipeline
  2116. iter->Stop();
  2117. }
  2118. TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess) {
  2119. // Testing the parameter of UnicodeCharTokenizer interface when the with_offsets is default.
  2120. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess.";
  2121. // Create a TextFile dataset
  2122. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  2123. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2124. EXPECT_NE(ds, nullptr);
  2125. // Create unicodechar_tokenizer operation on ds
  2126. std::shared_ptr<TensorTransform> unicodechar_tokenizer = std::make_shared<text::UnicodeCharTokenizer>();
  2127. EXPECT_NE(unicodechar_tokenizer, nullptr);
  2128. // Create Map operation on ds
  2129. ds = ds->Map({unicodechar_tokenizer}, {"text"});
  2130. EXPECT_NE(ds, nullptr);
  2131. // Create an iterator over the result of the above dataset
  2132. // This will trigger the creation of the Execution Tree and launch it.
  2133. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2134. EXPECT_NE(iter, nullptr);
  2135. // Iterate the dataset and get each row
  2136. std::unordered_map<std::string, mindspore::MSTensor> row;
  2137. ASSERT_OK(iter->GetNextRow(&row));
  2138. std::vector<std::vector<std::string>> expected = {
  2139. {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
  2140. {"北", "京", "欢", "迎", "您", "!"},
  2141. {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
  2142. {" ", " "}};
  2143. uint64_t i = 0;
  2144. while (row.size() != 0) {
  2145. auto ind = row["text"];
  2146. std::shared_ptr<Tensor> de_expected_tensor;
  2147. int x = expected[i].size();
  2148. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
  2149. mindspore::MSTensor expected_tensor =
  2150. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2151. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  2152. ASSERT_OK(iter->GetNextRow(&row));
  2153. i++;
  2154. }
  2155. EXPECT_EQ(i, 4);
  2156. // Manually terminate the pipeline
  2157. iter->Stop();
  2158. }
  2159. TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess1) {
  2160. // Testing the parameter of UnicodeCharTokenizer interface when the with_offsets is true.
  2161. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess1.";
  2162. // Create a TextFile dataset
  2163. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  2164. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2165. EXPECT_NE(ds, nullptr);
  2166. // Create unicodechar_tokenizer operation on ds
  2167. std::shared_ptr<TensorTransform> unicodechar_tokenizer = std::make_shared<text::UnicodeCharTokenizer>(true);
  2168. EXPECT_NE(unicodechar_tokenizer, nullptr);
  2169. // Create Map operation on ds
  2170. ds = ds->Map({unicodechar_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
  2171. {"token", "offsets_start", "offsets_limit"});
  2172. EXPECT_NE(ds, nullptr);
  2173. // Create an iterator over the result of the above dataset
  2174. // This will trigger the creation of the Execution Tree and launch it.
  2175. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2176. EXPECT_NE(iter, nullptr);
  2177. // Iterate the dataset and get each row
  2178. std::unordered_map<std::string, mindspore::MSTensor> row;
  2179. ASSERT_OK(iter->GetNextRow(&row));
  2180. std::vector<std::vector<std::string>> expected_tokens = {
  2181. {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
  2182. {"北", "京", "欢", "迎", "您", "!"},
  2183. {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
  2184. {" ", " "}};
  2185. std::vector<std::vector<uint32_t>> expected_offsets_start = {
  2186. {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18},
  2187. {0, 3, 6, 9, 12, 15},
  2188. {0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16},
  2189. {0, 1}};
  2190. std::vector<std::vector<uint32_t>> expected_offsets_limit = {
  2191. {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
  2192. {3, 6, 9, 12, 15, 18},
  2193. {3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17},
  2194. {1, 2}};
  2195. uint64_t i = 0;
  2196. while (row.size() != 0) {
  2197. auto token = row["token"];
  2198. auto start = row["offsets_start"];
  2199. auto limit = row["offsets_limit"];
  2200. std::shared_ptr<Tensor> de_expected_tokens;
  2201. int x = expected_tokens[i].size();
  2202. ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
  2203. mindspore::MSTensor ms_expected_tokens =
  2204. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  2205. EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
  2206. std::shared_ptr<Tensor> de_expected_offsets_start;
  2207. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
  2208. mindspore::MSTensor ms_expected_offsets_start =
  2209. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  2210. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  2211. std::shared_ptr<Tensor> de_expected_offsets_limit;
  2212. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
  2213. mindspore::MSTensor ms_expected_offsets_limit =
  2214. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  2215. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  2216. ASSERT_OK(iter->GetNextRow(&row));
  2217. i++;
  2218. }
  2219. EXPECT_EQ(i, 4);
  2220. // Manually terminate the pipeline
  2221. iter->Stop();
  2222. }
  // Word lists shared by the WordpieceTokenizer tests below; each test passes one of them to
  // Vocab::BuildFromVector. They are only read by the visible tests, so they are declared const
  // (which also gives them internal linkage in this translation unit).
  const std::vector<std::string> vocab_english = {"book", "cholera", "era", "favor", "##ite", "my",
                                                  "is",   "love",    "dur", "##ing", "the"};
  const std::vector<std::string> vocab_chinese = {"我", "最", "喜", "欢", "的", "书", "是",
                                                  "霍", "乱", "时", "期", "爱", "情"};
  2226. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess1) {
  2227. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess1.";
  2228. // Test WordpieceTokenizer with default parameters on English vocab
  2229. // Create a TextFile dataset
  2230. std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  2231. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2232. EXPECT_NE(ds, nullptr);
  2233. // Create Take operation on ds
  2234. ds = ds->Take(10);
  2235. EXPECT_NE(ds, nullptr);
  2236. // Create a vocab from vector
  2237. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  2238. Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  2239. EXPECT_EQ(s, Status::OK());
  2240. // Create WordpieceTokenizer operation on ds
  2241. std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab);
  2242. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2243. // Create Map operation on ds
  2244. ds = ds->Map({wordpiece_tokenizer}, {"text"});
  2245. EXPECT_NE(ds, nullptr);
  2246. // Create an iterator over the result of the above dataset
  2247. // This will trigger the creation of the Execution Tree and launch it.
  2248. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2249. EXPECT_NE(iter, nullptr);
  2250. // Iterate the dataset and get each row
  2251. std::unordered_map<std::string, mindspore::MSTensor> row;
  2252. ASSERT_OK(iter->GetNextRow(&row));
  2253. std::vector<std::vector<std::string>> expected = {
  2254. {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};
  2255. uint64_t i = 0;
  2256. while (row.size() != 0) {
  2257. auto txt = row["text"];
  2258. std::shared_ptr<Tensor> de_expected_tensor;
  2259. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  2260. mindspore::MSTensor expected_tensor =
  2261. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2262. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  2263. ASSERT_OK(iter->GetNextRow(&row));
  2264. i++;
  2265. }
  2266. EXPECT_EQ(i, 10);
  2267. // Manually terminate the pipeline
  2268. iter->Stop();
  2269. }
  2270. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess2) {
  2271. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess2.";
  2272. // Test WordpieceTokenizer with empty unknown_token
  2273. // Create a TextFile dataset
  2274. std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  2275. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2276. EXPECT_NE(ds, nullptr);
  2277. // Create Take operation on ds
  2278. ds = ds->Take(10);
  2279. EXPECT_NE(ds, nullptr);
  2280. // Create a vocab from vector
  2281. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  2282. Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  2283. EXPECT_EQ(s, Status::OK());
  2284. // Create WordpieceTokenizer operation on ds
  2285. std::shared_ptr<TensorTransform> wordpiece_tokenizer =
  2286. std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "", false);
  2287. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2288. // Create Map operation on ds
  2289. ds = ds->Map({wordpiece_tokenizer}, {"text"});
  2290. EXPECT_NE(ds, nullptr);
  2291. // Create an iterator over the result of the above dataset
  2292. // This will trigger the creation of the Execution Tree and launch it.
  2293. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2294. EXPECT_NE(iter, nullptr);
  2295. // Iterate the dataset and get each row
  2296. std::unordered_map<std::string, mindspore::MSTensor> row;
  2297. ASSERT_OK(iter->GetNextRow(&row));
  2298. std::vector<std::vector<std::string>> expected = {
  2299. {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"what"}};
  2300. uint64_t i = 0;
  2301. while (row.size() != 0) {
  2302. auto txt = row["text"];
  2303. std::shared_ptr<Tensor> de_expected_tensor;
  2304. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  2305. mindspore::MSTensor expected_tensor =
  2306. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2307. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  2308. ASSERT_OK(iter->GetNextRow(&row));
  2309. i++;
  2310. }
  2311. EXPECT_EQ(i, 10);
  2312. // Manually terminate the pipeline
  2313. iter->Stop();
  2314. }
  2315. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess3) {
  2316. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess3.";
  2317. // Test WordpieceTokenizer with non-default max_bytes_per_token
  2318. // Create a TextFile dataset
  2319. std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  2320. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2321. EXPECT_NE(ds, nullptr);
  2322. // Create Take operation on ds
  2323. ds = ds->Take(10);
  2324. EXPECT_NE(ds, nullptr);
  2325. // Create a vocab from vector
  2326. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  2327. Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  2328. EXPECT_EQ(s, Status::OK());
  2329. // Create WordpieceTokenizer operation on ds
  2330. std::shared_ptr<TensorTransform> wordpiece_tokenizer =
  2331. std::make_shared<text::WordpieceTokenizer>(vocab, "##", 4, "[UNK]", false);
  2332. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2333. // Create Map operation on ds
  2334. ds = ds->Map({wordpiece_tokenizer}, {"text"});
  2335. EXPECT_NE(ds, nullptr);
  2336. // Create an iterator over the result of the above dataset
  2337. // This will trigger the creation of the Execution Tree and launch it.
  2338. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2339. EXPECT_NE(iter, nullptr);
  2340. // Iterate the dataset and get each row
  2341. std::unordered_map<std::string, mindspore::MSTensor> row;
  2342. ASSERT_OK(iter->GetNextRow(&row));
  2343. std::vector<std::vector<std::string>> expected = {{"my"}, {"[UNK]"}, {"book"}, {"is"}, {"love"},
  2344. {"[UNK]"}, {"the"}, {"[UNK]"}, {"era"}, {"[UNK]"}};
  2345. uint64_t i = 0;
  2346. while (row.size() != 0) {
  2347. auto txt = row["text"];
  2348. std::shared_ptr<Tensor> de_expected_tensor;
  2349. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  2350. mindspore::MSTensor expected_tensor =
  2351. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2352. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  2353. ASSERT_OK(iter->GetNextRow(&row));
  2354. i++;
  2355. }
  2356. EXPECT_EQ(i, 10);
  2357. // Manually terminate the pipeline
  2358. iter->Stop();
  2359. }
  2360. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess4) {
  2361. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess4.";
  2362. // Test WordpieceTokenizer with default parameters on Chinese vocab
  2363. // Create a TextFile dataset
  2364. std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  2365. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2366. EXPECT_NE(ds, nullptr);
  2367. // Create Skip operation on ds
  2368. ds = ds->Skip(10);
  2369. EXPECT_NE(ds, nullptr);
  2370. // Create Take operation on ds
  2371. ds = ds->Take(15);
  2372. EXPECT_NE(ds, nullptr);
  2373. // Create a vocab from vector
  2374. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  2375. Status s = Vocab::BuildFromVector(vocab_chinese, {}, true, &vocab);
  2376. EXPECT_EQ(s, Status::OK());
  2377. // Create WordpieceTokenizer operation on ds
  2378. std::shared_ptr<TensorTransform> wordpiece_tokenizer =
  2379. std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", false);
  2380. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2381. // Create Map operation on ds
  2382. ds = ds->Map({wordpiece_tokenizer}, {"text"});
  2383. EXPECT_NE(ds, nullptr);
  2384. // Create an iterator over the result of the above dataset
  2385. // This will trigger the creation of the Execution Tree and launch it.
  2386. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2387. EXPECT_NE(iter, nullptr);
  2388. // Iterate the dataset and get each row
  2389. std::unordered_map<std::string, mindspore::MSTensor> row;
  2390. ASSERT_OK(iter->GetNextRow(&row));
  2391. std::vector<std::vector<std::string>> expected = {{"我"}, {"最"}, {"喜"}, {"欢"}, {"的"}, {"书"}, {"是"}, {"霍"},
  2392. {"乱"}, {"时"}, {"期"}, {"的"}, {"爱"}, {"情"}, {"[UNK]"}};
  2393. uint64_t i = 0;
  2394. while (row.size() != 0) {
  2395. auto txt = row["text"];
  2396. std::shared_ptr<Tensor> de_expected_tensor;
  2397. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  2398. mindspore::MSTensor expected_tensor =
  2399. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2400. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  2401. ASSERT_OK(iter->GetNextRow(&row));
  2402. i++;
  2403. }
  2404. EXPECT_EQ(i, 15);
  2405. // Manually terminate the pipeline
  2406. iter->Stop();
  2407. }
  2408. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess5) {
  2409. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess5.";
  2410. // Test WordpieceTokenizer with with_offsets true
  2411. // Create a TextFile dataset
  2412. std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  2413. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2414. EXPECT_NE(ds, nullptr);
  2415. // Create Take operation on ds
  2416. ds = ds->Take(10);
  2417. EXPECT_NE(ds, nullptr);
  2418. // Create a vocab from vector
  2419. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  2420. Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  2421. EXPECT_EQ(s, Status::OK());
  2422. // Create WordpieceTokenizer operation on ds
  2423. std::shared_ptr<TensorTransform> wordpiece_tokenizer =
  2424. std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", true);
  2425. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2426. // Create Map operation on ds
  2427. ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  2428. EXPECT_NE(ds, nullptr);
  2429. // Create an iterator over the result of the above dataset
  2430. // This will trigger the creation of the Execution Tree and launch it.
  2431. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2432. EXPECT_NE(iter, nullptr);
  2433. // Iterate the dataset and get each row
  2434. std::unordered_map<std::string, mindspore::MSTensor> row;
  2435. ASSERT_OK(iter->GetNextRow(&row));
  2436. std::vector<std::vector<std::string>> expected = {
  2437. {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};
  2438. std::vector<std::vector<uint32_t>> expected_offsets_start = {{0}, {0, 5}, {0}, {0}, {0}, {0, 3}, {0}, {0}, {0}, {0}};
  2439. std::vector<std::vector<uint32_t>> expected_offsets_limit = {{2}, {5, 8}, {4}, {2}, {4}, {3, 6}, {3}, {7}, {3}, {4}};
  2440. uint64_t i = 0;
  2441. while (row.size() != 0) {
  2442. auto txt = row["token"];
  2443. std::shared_ptr<Tensor> de_expected_tensor;
  2444. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  2445. mindspore::MSTensor expected_tensor =
  2446. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2447. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  2448. auto start = row["offsets_start"];
  2449. std::shared_ptr<Tensor> de_expected_start_tensor;
  2450. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], &de_expected_start_tensor));
  2451. mindspore::MSTensor expected_start_tensor =
  2452. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_start_tensor));
  2453. EXPECT_MSTENSOR_EQ(start, expected_start_tensor);
  2454. auto limit = row["offsets_limit"];
  2455. std::shared_ptr<Tensor> de_expected_limit_tensor;
  2456. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], &de_expected_limit_tensor));
  2457. mindspore::MSTensor expected_limit_tensor =
  2458. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_limit_tensor));
  2459. EXPECT_MSTENSOR_EQ(limit, expected_limit_tensor);
  2460. ASSERT_OK(iter->GetNextRow(&row));
  2461. i++;
  2462. }
  2463. EXPECT_EQ(i, 10);
  2464. // Manually terminate the pipeline
  2465. iter->Stop();
  2466. }
  2467. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess6) {
  2468. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess6.";
  2469. // Test WordpieceTokenizer with max_bytes_per_token equals to 0
  2470. // Create a TextFile dataset
  2471. std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  2472. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2473. EXPECT_NE(ds, nullptr);
  2474. // Create Take operation on ds
  2475. ds = ds->Take(10);
  2476. EXPECT_NE(ds, nullptr);
  2477. // Create a vocab from vector
  2478. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  2479. Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  2480. EXPECT_EQ(s, Status::OK());
  2481. // Create WordpieceTokenizer operation on ds
  2482. std::shared_ptr<TensorTransform> wordpiece_tokenizer =
  2483. std::make_shared<text::WordpieceTokenizer>(vocab, "##", 0, "[UNK]", true);
  2484. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2485. // Create Map operation on ds
  2486. ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  2487. EXPECT_NE(ds, nullptr);
  2488. // Create an iterator over the result of the above dataset
  2489. // This will trigger the creation of the Execution Tree and launch it.
  2490. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2491. EXPECT_NE(iter, nullptr);
  2492. // Iterate the dataset and get each row
  2493. std::unordered_map<std::string, mindspore::MSTensor> row;
  2494. ASSERT_OK(iter->GetNextRow(&row));
  2495. std::vector<std::vector<std::string>> expected = {{"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"},
  2496. {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}};
  2497. uint64_t i = 0;
  2498. while (row.size() != 0) {
  2499. auto txt = row["token"];
  2500. std::shared_ptr<Tensor> de_expected_tensor;
  2501. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
  2502. mindspore::MSTensor expected_tensor =
  2503. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2504. EXPECT_MSTENSOR_EQ(txt, expected_tensor);
  2505. ASSERT_OK(iter->GetNextRow(&row));
  2506. i++;
  2507. }
  2508. EXPECT_EQ(i, 10);
  2509. // Manually terminate the pipeline
  2510. iter->Stop();
  2511. }
  2512. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail1) {
  2513. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail1.";
  2514. // Test WordpieceTokenizer with nullptr vocab
  2515. // Create a TextFile dataset
  2516. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  2517. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2518. EXPECT_NE(ds, nullptr);
  2519. // Create WordpieceTokenizer operation on ds
  2520. std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(nullptr);
  2521. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2522. // Create a Map operation on ds
  2523. ds = ds->Map({wordpiece_tokenizer});
  2524. EXPECT_NE(ds, nullptr);
  2525. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2526. // Expect failure: invalid WordpieceTokenizer input with nullptr vocab
  2527. EXPECT_EQ(iter, nullptr);
  2528. }
  2529. TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail2) {
  2530. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail2.";
  2531. // Test WordpieceTokenizer with negative max_bytes_per_token
  2532. // Create a TextFile dataset
  2533. std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  2534. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2535. EXPECT_NE(ds, nullptr);
  2536. // Create a vocab from vector
  2537. std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  2538. Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  2539. EXPECT_EQ(s, Status::OK());
  2540. // Create WordpieceTokenizer operation on ds
  2541. std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab, "##", -1);
  2542. EXPECT_NE(wordpiece_tokenizer, nullptr);
  2543. // Create a Map operation on ds
  2544. ds = ds->Map({wordpiece_tokenizer});
  2545. EXPECT_NE(ds, nullptr);
  2546. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2547. // Expect failure: invalid WordpieceTokenizer input with nullptr vocab
  2548. EXPECT_EQ(iter, nullptr);
  2549. }
  2550. TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess) {
  2551. // Testing the parameter of UnicodeScriptTokenizer interface when the with_offsets and the keep_whitespace is default.
  2552. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess.";
  2553. // Create a TextFile dataset
  2554. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  2555. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2556. EXPECT_NE(ds, nullptr);
  2557. // Create unicodescript_tokenizer operation on ds
  2558. std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>();
  2559. EXPECT_NE(unicodescript_tokenizer, nullptr);
  2560. // Create Map operation on ds
  2561. ds = ds->Map({unicodescript_tokenizer}, {"text"});
  2562. EXPECT_NE(ds, nullptr);
  2563. // Create an iterator over the result of the above dataset
  2564. // This will trigger the creation of the Execution Tree and launch it.
  2565. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2566. EXPECT_NE(iter, nullptr);
  2567. // Iterate the dataset and get each row
  2568. std::unordered_map<std::string, mindspore::MSTensor> row;
  2569. ASSERT_OK(iter->GetNextRow(&row));
  2570. std::vector<std::vector<std::string>> expected = {
  2571. {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};
  2572. uint64_t i = 0;
  2573. while (row.size() != 0) {
  2574. auto ind = row["text"];
  2575. std::shared_ptr<Tensor> de_expected_tensor;
  2576. int x = expected[i].size();
  2577. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
  2578. mindspore::MSTensor expected_tensor =
  2579. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2580. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  2581. ASSERT_OK(iter->GetNextRow(&row));
  2582. i++;
  2583. }
  2584. EXPECT_EQ(i, 4);
  2585. // Manually terminate the pipeline
  2586. iter->Stop();
  2587. }
  2588. TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess1) {
  2589. // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is true and the with_offsets is
  2590. // false.
  2591. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess1.";
  2592. // Create a TextFile dataset
  2593. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  2594. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2595. EXPECT_NE(ds, nullptr);
  2596. // Create unicodescript_tokenizer operation on ds
  2597. std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>(true);
  2598. EXPECT_NE(unicodescript_tokenizer, nullptr);
  2599. // Create Map operation on ds
  2600. ds = ds->Map({unicodescript_tokenizer}, {"text"});
  2601. EXPECT_NE(ds, nullptr);
  2602. // Create an iterator over the result of the above dataset
  2603. // This will trigger the creation of the Execution Tree and launch it.
  2604. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2605. EXPECT_NE(iter, nullptr);
  2606. // Iterate the dataset and get each row
  2607. std::unordered_map<std::string, mindspore::MSTensor> row;
  2608. ASSERT_OK(iter->GetNextRow(&row));
  2609. std::vector<std::vector<std::string>> expected = {
  2610. {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {" "}};
  2611. uint64_t i = 0;
  2612. while (row.size() != 0) {
  2613. auto ind = row["text"];
  2614. std::shared_ptr<Tensor> de_expected_tensor;
  2615. int x = expected[i].size();
  2616. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
  2617. mindspore::MSTensor expected_tensor =
  2618. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2619. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  2620. ASSERT_OK(iter->GetNextRow(&row));
  2621. i++;
  2622. }
  2623. EXPECT_EQ(i, 4);
  2624. // Manually terminate the pipeline
  2625. iter->Stop();
  2626. }
  2627. TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess2) {
  2628. // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is false and the with_offsets is
  2629. // true.
  2630. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess2.";
  2631. // Create a TextFile dataset
  2632. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  2633. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2634. EXPECT_NE(ds, nullptr);
  2635. // Create unicodescript_tokenizer operation on ds
  2636. std::shared_ptr<TensorTransform> unicodescript_tokenizer =
  2637. std::make_shared<text::UnicodeScriptTokenizer>(false, true);
  2638. EXPECT_NE(unicodescript_tokenizer, nullptr);
  2639. // Create Map operation on ds
  2640. ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
  2641. {"token", "offsets_start", "offsets_limit"});
  2642. EXPECT_NE(ds, nullptr);
  2643. // Create an iterator over the result of the above dataset
  2644. // This will trigger the creation of the Execution Tree and launch it.
  2645. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2646. EXPECT_NE(iter, nullptr);
  2647. // Iterate the dataset and get each row
  2648. std::unordered_map<std::string, mindspore::MSTensor> row;
  2649. ASSERT_OK(iter->GetNextRow(&row));
  2650. std::vector<std::vector<std::string>> expected_tokens = {
  2651. {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};
  2652. std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
  2653. std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 18, 19}, {15, 18}, {9, 16, 17}, {0}};
  2654. uint64_t i = 0;
  2655. while (row.size() != 0) {
  2656. auto token = row["token"];
  2657. auto start = row["offsets_start"];
  2658. auto limit = row["offsets_limit"];
  2659. std::shared_ptr<Tensor> de_expected_tokens;
  2660. int x = expected_tokens[i].size();
  2661. ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
  2662. mindspore::MSTensor ms_expected_tokens =
  2663. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  2664. EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
  2665. std::shared_ptr<Tensor> de_expected_offsets_start;
  2666. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
  2667. mindspore::MSTensor ms_expected_offsets_start =
  2668. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  2669. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  2670. std::shared_ptr<Tensor> de_expected_offsets_limit;
  2671. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
  2672. mindspore::MSTensor ms_expected_offsets_limit =
  2673. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  2674. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  2675. ASSERT_OK(iter->GetNextRow(&row));
  2676. i++;
  2677. }
  2678. EXPECT_EQ(i, 4);
  2679. // Manually terminate the pipeline
  2680. iter->Stop();
  2681. }
  2682. TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess3) {
  2683. // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is true and the with_offsets is
  2684. // true.
  2685. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess3.";
  2686. // Create a TextFile dataset
  2687. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  2688. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2689. EXPECT_NE(ds, nullptr);
  2690. // Create unicodescript_tokenizer operation on ds
  2691. std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>(true, true);
  2692. EXPECT_NE(unicodescript_tokenizer, nullptr);
  2693. // Create Map operation on ds
  2694. ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
  2695. {"token", "offsets_start", "offsets_limit"});
  2696. EXPECT_NE(ds, nullptr);
  2697. // Create an iterator over the result of the above dataset
  2698. // This will trigger the creation of the Execution Tree and launch it.
  2699. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2700. EXPECT_NE(iter, nullptr);
  2701. // Iterate the dataset and get each row
  2702. std::unordered_map<std::string, mindspore::MSTensor> row;
  2703. ASSERT_OK(iter->GetNextRow(&row));
  2704. std::vector<std::vector<std::string>> expected_tokens = {
  2705. {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {" "}};
  2706. std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 7, 8, 10, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
  2707. std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 8, 10, 11, 18, 19}, {15, 18}, {9, 16, 17}, {2}};
  2708. uint64_t i = 0;
  2709. while (row.size() != 0) {
  2710. auto token = row["token"];
  2711. auto start = row["offsets_start"];
  2712. auto limit = row["offsets_limit"];
  2713. std::shared_ptr<Tensor> de_expected_tokens;
  2714. int x = expected_tokens[i].size();
  2715. ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
  2716. mindspore::MSTensor ms_expected_tokens =
  2717. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  2718. EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
  2719. std::shared_ptr<Tensor> de_expected_offsets_start;
  2720. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
  2721. mindspore::MSTensor ms_expected_offsets_start =
  2722. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  2723. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  2724. std::shared_ptr<Tensor> de_expected_offsets_limit;
  2725. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
  2726. mindspore::MSTensor ms_expected_offsets_limit =
  2727. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  2728. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  2729. ASSERT_OK(iter->GetNextRow(&row));
  2730. i++;
  2731. }
  2732. EXPECT_EQ(i, 4);
  2733. // Manually terminate the pipeline
  2734. iter->Stop();
  2735. }
  2736. TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess) {
  2737. // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is default.
  2738. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess.";
  2739. // Create a TextFile dataset
  2740. std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  2741. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2742. EXPECT_NE(ds, nullptr);
  2743. // Create white_tokenizer operation on ds
  2744. std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  2745. EXPECT_NE(white_tokenizer, nullptr);
  2746. // Create Map operation on ds
  2747. ds = ds->Map({white_tokenizer}, {"text"});
  2748. EXPECT_NE(ds, nullptr);
  2749. // Create an iterator over the result of the above dataset
  2750. // This will trigger the creation of the Execution Tree and launch it.
  2751. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2752. EXPECT_NE(iter, nullptr);
  2753. // Iterate the dataset and get each row
  2754. std::unordered_map<std::string, mindspore::MSTensor> row;
  2755. ASSERT_OK(iter->GetNextRow(&row));
  2756. std::vector<std::vector<std::string>> expected = {
  2757. {"This", "is", "a", "text", "file."}, {"Be", "happy", "every", "day."}, {"Good", "luck", "to", "everyone."}};
  2758. uint64_t i = 0;
  2759. while (row.size() != 0) {
  2760. auto ind = row["text"];
  2761. std::shared_ptr<Tensor> de_expected_tensor;
  2762. int x = expected[i].size();
  2763. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
  2764. mindspore::MSTensor expected_tensor =
  2765. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
  2766. EXPECT_MSTENSOR_EQ(ind, expected_tensor);
  2767. ASSERT_OK(iter->GetNextRow(&row));
  2768. i++;
  2769. }
  2770. EXPECT_EQ(i, 3);
  2771. // Manually terminate the pipeline
  2772. iter->Stop();
  2773. }
  2774. TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess1) {
  2775. // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is true.
  2776. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess1.";
  2777. // Create a TextFile dataset
  2778. std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  2779. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2780. EXPECT_NE(ds, nullptr);
  2781. // Create white_tokenizer operation on ds
  2782. std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>(true);
  2783. EXPECT_NE(white_tokenizer, nullptr);
  2784. // Create Map operation on ds
  2785. ds = ds->Map({white_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
  2786. {"token", "offsets_start", "offsets_limit"});
  2787. EXPECT_NE(ds, nullptr);
  2788. // Create an iterator over the result of the above dataset
  2789. // This will trigger the creation of the Execution Tree and launch it.
  2790. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2791. EXPECT_NE(iter, nullptr);
  2792. // Iterate the dataset and get each row
  2793. std::unordered_map<std::string, mindspore::MSTensor> row;
  2794. ASSERT_OK(iter->GetNextRow(&row));
  2795. std::vector<std::vector<std::string>> expected_tokens = {
  2796. {"Welcome", "to", "Beijing!"}, {"北京欢迎您!"}, {"我喜欢English!"}, {""}};
  2797. std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11}, {0}, {0}, {0}};
  2798. std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 19}, {18}, {17}, {0}};
  2799. uint64_t i = 0;
  2800. while (row.size() != 0) {
  2801. auto token = row["token"];
  2802. auto start = row["offsets_start"];
  2803. auto limit = row["offsets_limit"];
  2804. std::shared_ptr<Tensor> de_expected_tokens;
  2805. int x = expected_tokens[i].size();
  2806. ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
  2807. mindspore::MSTensor ms_expected_tokens =
  2808. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
  2809. EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
  2810. std::shared_ptr<Tensor> de_expected_offsets_start;
  2811. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
  2812. mindspore::MSTensor ms_expected_offsets_start =
  2813. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
  2814. EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
  2815. std::shared_ptr<Tensor> de_expected_offsets_limit;
  2816. ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
  2817. mindspore::MSTensor ms_expected_offsets_limit =
  2818. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
  2819. EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
  2820. ASSERT_OK(iter->GetNextRow(&row));
  2821. i++;
  2822. }
  2823. EXPECT_EQ(i, 4);
  2824. // Manually terminate the pipeline
  2825. iter->Stop();
  2826. }
  2827. /// Feature: Vectors
  2828. /// Description: test with default parameter in function BuildFromFile and function Lookup
  2829. /// Expectation: return correct MSTensor which is equal to the expected
  2830. TEST_F(MindDataTestPipeline, TestVectorsDefaultParam) {
  2831. // Test with default parameter.
  2832. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDefaultParam.";
  2833. // Create a TextFile dataset
  2834. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  2835. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2836. EXPECT_NE(ds, nullptr);
  2837. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
  2838. std::shared_ptr<Vectors> vectors;
  2839. Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
  2840. EXPECT_EQ(s, Status::OK());
  2841. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors);
  2842. EXPECT_NE(lookup, nullptr);
  2843. // Create Map operation on ds
  2844. ds = ds->Map({lookup}, {"text"});
  2845. EXPECT_NE(ds, nullptr);
  2846. // Create an iterator over the result of the above dataset
  2847. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2848. EXPECT_NE(iter, nullptr);
  2849. // Iterate the dataset and get each row
  2850. std::unordered_map<std::string, mindspore::MSTensor> row;
  2851. ASSERT_OK(iter->GetNextRow(&row));
  2852. uint64_t i = 0;
  2853. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  2854. {0, 0, 0, 0, 0, 0},
  2855. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  2856. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  2857. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  2858. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  2859. {0, 0, 0, 0, 0, 0}};
  2860. while (row.size() != 0) {
  2861. auto ind = row["text"];
  2862. MS_LOG(INFO) << ind.Shape();
  2863. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  2864. TensorPtr de_expected_item;
  2865. dsize_t dim = 6;
  2866. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  2867. mindspore::MSTensor ms_expected_item =
  2868. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  2869. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  2870. ASSERT_OK(iter->GetNextRow(&row));
  2871. i++;
  2872. }
  2873. EXPECT_EQ(i, 7);
  2874. // Manually terminate the pipeline
  2875. iter->Stop();
  2876. }
  2877. /// Feature: Vectors
  2878. /// Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
  2879. /// Expectation: return correct MSTensor which is equal to the expected
  2880. TEST_F(MindDataTestPipeline, TestVectorsAllBuildfromfileParams) {
  2881. // Test with two parameters.
  2882. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllBuildfromfileParams.";
  2883. // Create a TextFile dataset
  2884. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  2885. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2886. EXPECT_NE(ds, nullptr);
  2887. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
  2888. std::shared_ptr<Vectors> vectors;
  2889. Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
  2890. EXPECT_EQ(s, Status::OK());
  2891. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors);
  2892. EXPECT_NE(lookup, nullptr);
  2893. // Create Map operation on ds
  2894. ds = ds->Map({lookup}, {"text"});
  2895. EXPECT_NE(ds, nullptr);
  2896. // Create an iterator over the result of the above dataset
  2897. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2898. EXPECT_NE(iter, nullptr);
  2899. // Iterate the dataset and get each row
  2900. std::unordered_map<std::string, mindspore::MSTensor> row;
  2901. ASSERT_OK(iter->GetNextRow(&row));
  2902. uint64_t i = 0;
  2903. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  2904. {0, 0, 0, 0, 0, 0},
  2905. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  2906. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  2907. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  2908. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  2909. {0, 0, 0, 0, 0, 0}};
  2910. while (row.size() != 0) {
  2911. auto ind = row["text"];
  2912. MS_LOG(INFO) << ind.Shape();
  2913. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  2914. TensorPtr de_expected_item;
  2915. dsize_t dim = 6;
  2916. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  2917. mindspore::MSTensor ms_expected_item =
  2918. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  2919. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  2920. ASSERT_OK(iter->GetNextRow(&row));
  2921. i++;
  2922. }
  2923. EXPECT_EQ(i, 7);
  2924. // Manually terminate the pipeline
  2925. iter->Stop();
  2926. }
  2927. /// Feature: Vectors
  2928. /// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
  2929. /// Expectation: return correct MSTensor which is equal to the expected
  2930. TEST_F(MindDataTestPipeline, TestVectorsUnknownInit) {
  2931. // Test with two parameters.
  2932. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsUnknownInit.";
  2933. // Create a TextFile dataset
  2934. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  2935. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2936. EXPECT_NE(ds, nullptr);
  2937. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
  2938. std::shared_ptr<Vectors> vectors;
  2939. Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
  2940. EXPECT_EQ(s, Status::OK());
  2941. std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  2942. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init);
  2943. EXPECT_NE(lookup, nullptr);
  2944. // Create Map operation on ds
  2945. ds = ds->Map({lookup}, {"text"});
  2946. EXPECT_NE(ds, nullptr);
  2947. // Create an iterator over the result of the above dataset
  2948. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  2949. EXPECT_NE(iter, nullptr);
  2950. // Iterate the dataset and get each row
  2951. std::unordered_map<std::string, mindspore::MSTensor> row;
  2952. ASSERT_OK(iter->GetNextRow(&row));
  2953. uint64_t i = 0;
  2954. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  2955. {-1, -1, -1, -1, -1, -1},
  2956. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  2957. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  2958. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  2959. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  2960. {-1, -1, -1, -1, -1, -1}};
  2961. while (row.size() != 0) {
  2962. auto ind = row["text"];
  2963. MS_LOG(INFO) << ind.Shape();
  2964. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  2965. TensorPtr de_expected_item;
  2966. dsize_t dim = 6;
  2967. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  2968. mindspore::MSTensor ms_expected_item =
  2969. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  2970. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  2971. ASSERT_OK(iter->GetNextRow(&row));
  2972. i++;
  2973. }
  2974. EXPECT_EQ(i, 7);
  2975. // Manually terminate the pipeline
  2976. iter->Stop();
  2977. }
  2978. /// Feature: Vectors
  2979. /// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
  2980. /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
  2981. /// Expectation: return correct MSTensor which is equal to the expected
  2982. TEST_F(MindDataTestPipeline, TestVectorsAllParams) {
  2983. // Test with all parameters.
  2984. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllParams.";
  2985. // Create a TextFile dataset
  2986. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  2987. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  2988. EXPECT_NE(ds, nullptr);
  2989. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
  2990. std::shared_ptr<Vectors> vectors;
  2991. Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
  2992. EXPECT_EQ(s, Status::OK());
  2993. std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  2994. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
  2995. EXPECT_NE(lookup, nullptr);
  2996. // Create Map operation on ds
  2997. ds = ds->Map({lookup}, {"text"});
  2998. EXPECT_NE(ds, nullptr);
  2999. // Create an iterator over the result of the above dataset
  3000. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3001. EXPECT_NE(iter, nullptr);
  3002. // Iterate the dataset and get each row
  3003. std::unordered_map<std::string, mindspore::MSTensor> row;
  3004. ASSERT_OK(iter->GetNextRow(&row));
  3005. uint64_t i = 0;
  3006. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3007. {-1, -1, -1, -1, -1, -1},
  3008. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3009. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3010. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3011. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3012. {-1, -1, -1, -1, -1, -1}};
  3013. while (row.size() != 0) {
  3014. auto ind = row["text"];
  3015. MS_LOG(INFO) << ind.Shape();
  3016. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3017. TensorPtr de_expected_item;
  3018. dsize_t dim = 6;
  3019. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3020. mindspore::MSTensor ms_expected_item =
  3021. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3022. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3023. ASSERT_OK(iter->GetNextRow(&row));
  3024. i++;
  3025. }
  3026. EXPECT_EQ(i, 7);
  3027. // Manually terminate the pipeline
  3028. iter->Stop();
  3029. }
  3030. /// Feature: Vectors
  3031. /// Description: test with pre-vectors set that have the different dimension
  3032. /// Expectation: throw correct error and message
  3033. TEST_F(MindDataTestPipeline, TestVectorsDifferentDimension) {
  3034. // Tokens don't have the same number of vectors.
  3035. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDifferentDimension.";
  3036. // Create a TextFile dataset
  3037. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3038. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3039. EXPECT_NE(ds, nullptr);
  3040. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_dim_different.txt";
  3041. std::shared_ptr<Vectors> vectors;
  3042. Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
  3043. EXPECT_NE(s, Status::OK());
  3044. }
  3045. /// Feature: Vectors
  3046. /// Description: test with pre-vectors set that has the head-info
  3047. /// Expectation: return correct MSTensor which is equal to the expected
  3048. TEST_F(MindDataTestPipeline, TestVectorsWithHeadInfo) {
  3049. // Test with words that has head info.
  3050. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithHeadInfo.";
  3051. // Create a TextFile dataset
  3052. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3053. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3054. EXPECT_NE(ds, nullptr);
  3055. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_with_info.txt";
  3056. std::shared_ptr<Vectors> vectors;
  3057. Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
  3058. EXPECT_EQ(s, Status::OK());
  3059. std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  3060. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
  3061. EXPECT_NE(lookup, nullptr);
  3062. // Create Map operation on ds
  3063. ds = ds->Map({lookup}, {"text"});
  3064. EXPECT_NE(ds, nullptr);
  3065. // Create an iterator over the result of the above dataset
  3066. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3067. EXPECT_NE(iter, nullptr);
  3068. // Iterate the dataset and get each row
  3069. std::unordered_map<std::string, mindspore::MSTensor> row;
  3070. ASSERT_OK(iter->GetNextRow(&row));
  3071. uint64_t i = 0;
  3072. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3073. {-1, -1, -1, -1, -1, -1},
  3074. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3075. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3076. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3077. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3078. {-1, -1, -1, -1, -1, -1}};
  3079. while (row.size() != 0) {
  3080. auto ind = row["text"];
  3081. MS_LOG(INFO) << ind.Shape();
  3082. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3083. TensorPtr de_expected_item;
  3084. dsize_t dim = 6;
  3085. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3086. mindspore::MSTensor ms_expected_item =
  3087. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3088. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3089. ASSERT_OK(iter->GetNextRow(&row));
  3090. i++;
  3091. }
  3092. EXPECT_EQ(i, 7);
  3093. // Manually terminate the pipeline
  3094. iter->Stop();
  3095. }
  3096. /// Feature: Vectors
  3097. /// Description: test with the parameter max_vectors that is <= 0
  3098. /// Expectation: throw correct error and message
  3099. TEST_F(MindDataTestPipeline, TestVectorsMaxVectorsLessThanZero) {
  3100. // Test with max_vectors <= 0.
  3101. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsMaxVectorsLessThanZero.";
  3102. // Create a TextFile dataset
  3103. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3104. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3105. EXPECT_NE(ds, nullptr);
  3106. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
  3107. std::shared_ptr<Vectors> vectors;
  3108. Status s = Vectors::BuildFromFile(&vectors, vectors_dir, -1);
  3109. EXPECT_NE(s, Status::OK());
  3110. }
  3111. /// Feature: Vectors
  3112. /// Description: test with the pre-vectors file that is empty
  3113. /// Expectation: throw correct error and message
  3114. TEST_F(MindDataTestPipeline, TestVectorsWithEmptyFile) {
  3115. // Read empty file.
  3116. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithEmptyFile.";
  3117. // Create a TextFile dataset
  3118. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3119. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3120. EXPECT_NE(ds, nullptr);
  3121. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_empty.txt";
  3122. std::shared_ptr<Vectors> vectors;
  3123. Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
  3124. EXPECT_NE(s, Status::OK());
  3125. }
  3126. /// Feature: Vectors
  3127. /// Description: test with the pre-vectors file that is not exist
  3128. /// Expectation: throw correct error and message
  3129. TEST_F(MindDataTestPipeline, TestVectorsWithNotExistFile) {
  3130. // Test with not exist file.
  3131. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithNotExistFile.";
  3132. // Create a TextFile dataset
  3133. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3134. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3135. EXPECT_NE(ds, nullptr);
  3136. std::string vectors_dir = datasets_root_path_ + "/testVectors/no_vectors.txt";
  3137. std::shared_ptr<Vectors> vectors;
  3138. Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
  3139. EXPECT_NE(s, Status::OK());
  3140. }
  3141. /// Feature: Vectors
  3142. /// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set
  3143. /// Expectation: throw correct error and message
  3144. TEST_F(MindDataTestPipeline, TestVectorsWithWrongInfoFile) {
  3145. // Wrong info.
  3146. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithWrongInfoFile.";
  3147. // Create a TextFile dataset
  3148. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3149. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3150. EXPECT_NE(ds, nullptr);
  3151. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_with_wrong_info.txt";
  3152. std::shared_ptr<Vectors> vectors;
  3153. Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
  3154. EXPECT_NE(s, Status::OK());
  3155. }
  3156. /// Feature: FastText
  3157. /// Description: test with default parameter in function BuildFromFile and function Lookup
  3158. /// Expectation: return correct MSTensor which is equal to the expected
  3159. TEST_F(MindDataTestPipeline, TestFastTextDefaultParam) {
  3160. // Test with default parameter.
  3161. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDefaultParam.";
  3162. // Create a TextFile dataset
  3163. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3164. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3165. EXPECT_NE(ds, nullptr);
  3166. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  3167. std::shared_ptr<FastText> fast_text;
  3168. Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  3169. EXPECT_EQ(s, Status::OK());
  3170. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text);
  3171. EXPECT_NE(lookup, nullptr);
  3172. // Create Map operation on ds
  3173. ds = ds->Map({lookup}, {"text"});
  3174. EXPECT_NE(ds, nullptr);
  3175. // Create an iterator over the result of the above dataset
  3176. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3177. EXPECT_NE(iter, nullptr);
  3178. // Iterate the dataset and get each row
  3179. std::unordered_map<std::string, mindspore::MSTensor> row;
  3180. ASSERT_OK(iter->GetNextRow(&row));
  3181. uint64_t i = 0;
  3182. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3183. {0, 0, 0, 0, 0, 0},
  3184. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3185. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3186. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3187. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3188. {0, 0, 0, 0, 0, 0}};
  3189. while (row.size() != 0) {
  3190. auto ind = row["text"];
  3191. MS_LOG(INFO) << ind.Shape();
  3192. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3193. TensorPtr de_expected_item;
  3194. dsize_t dim = 6;
  3195. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3196. mindspore::MSTensor ms_expected_item =
  3197. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3198. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3199. ASSERT_OK(iter->GetNextRow(&row));
  3200. i++;
  3201. }
  3202. EXPECT_EQ(i, 7);
  3203. // Manually terminate the pipeline
  3204. iter->Stop();
  3205. }
  3206. /// Feature: FastText
  3207. /// Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
  3208. /// Expectation: return correct MSTensor which is equal to the expected
  3209. TEST_F(MindDataTestPipeline, TestFastTextAllBuildfromfileParams) {
  3210. // Test with two parameters.
  3211. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllBuildfromfileParams.";
  3212. // Create a TextFile dataset
  3213. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3214. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3215. EXPECT_NE(ds, nullptr);
  3216. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  3217. std::shared_ptr<FastText> fast_text;
  3218. Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
  3219. EXPECT_EQ(s, Status::OK());
  3220. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text);
  3221. EXPECT_NE(lookup, nullptr);
  3222. // Create Map operation on ds
  3223. ds = ds->Map({lookup}, {"text"});
  3224. EXPECT_NE(ds, nullptr);
  3225. // Create an iterator over the result of the above dataset
  3226. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3227. EXPECT_NE(iter, nullptr);
  3228. // Iterate the dataset and get each row
  3229. std::unordered_map<std::string, mindspore::MSTensor> row;
  3230. ASSERT_OK(iter->GetNextRow(&row));
  3231. uint64_t i = 0;
  3232. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3233. {0, 0, 0, 0, 0, 0},
  3234. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3235. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3236. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3237. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3238. {0, 0, 0, 0, 0, 0}};
  3239. while (row.size() != 0) {
  3240. auto ind = row["text"];
  3241. MS_LOG(INFO) << ind.Shape();
  3242. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3243. TensorPtr de_expected_item;
  3244. dsize_t dim = 6;
  3245. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3246. mindspore::MSTensor ms_expected_item =
  3247. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3248. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3249. ASSERT_OK(iter->GetNextRow(&row));
  3250. i++;
  3251. }
  3252. EXPECT_EQ(i, 7);
  3253. // Manually terminate the pipeline
  3254. iter->Stop();
  3255. }
  3256. /// Feature: FastText
  3257. /// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
  3258. /// Expectation: return correct MSTensor which is equal to the expected
  3259. TEST_F(MindDataTestPipeline, TestFastTextUnknownInit) {
  3260. // Test with two parameters.
  3261. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextUnknownInit.";
  3262. // Create a TextFile dataset
  3263. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3264. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3265. EXPECT_NE(ds, nullptr);
  3266. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  3267. std::shared_ptr<FastText> fast_text;
  3268. Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
  3269. EXPECT_EQ(s, Status::OK());
  3270. std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  3271. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text, unknown_init);
  3272. EXPECT_NE(lookup, nullptr);
  3273. // Create Map operation on ds
  3274. ds = ds->Map({lookup}, {"text"});
  3275. EXPECT_NE(ds, nullptr);
  3276. // Create an iterator over the result of the above dataset
  3277. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3278. EXPECT_NE(iter, nullptr);
  3279. // Iterate the dataset and get each row
  3280. std::unordered_map<std::string, mindspore::MSTensor> row;
  3281. ASSERT_OK(iter->GetNextRow(&row));
  3282. uint64_t i = 0;
  3283. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3284. {-1, -1, -1, -1, -1, -1},
  3285. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3286. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3287. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3288. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3289. {-1, -1, -1, -1, -1, -1}};
  3290. while (row.size() != 0) {
  3291. auto ind = row["text"];
  3292. MS_LOG(INFO) << ind.Shape();
  3293. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3294. TensorPtr de_expected_item;
  3295. dsize_t dim = 6;
  3296. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3297. mindspore::MSTensor ms_expected_item =
  3298. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3299. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3300. ASSERT_OK(iter->GetNextRow(&row));
  3301. i++;
  3302. }
  3303. EXPECT_EQ(i, 7);
  3304. // Manually terminate the pipeline
  3305. iter->Stop();
  3306. }
  3307. /// Feature: FastText
  3308. /// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
  3309. /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
  3310. /// Expectation: return correct MSTensor which is equal to the expected
  3311. TEST_F(MindDataTestPipeline, TestFastTextAllParams) {
  3312. // Test with all parameters.
  3313. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllParams.";
  3314. // Create a TextFile dataset
  3315. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3316. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3317. EXPECT_NE(ds, nullptr);
  3318. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  3319. std::shared_ptr<FastText> fast_text;
  3320. Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  3321. EXPECT_EQ(s, Status::OK());
  3322. std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  3323. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text, unknown_init, true);
  3324. EXPECT_NE(lookup, nullptr);
  3325. // Create Map operation on ds
  3326. ds = ds->Map({lookup}, {"text"});
  3327. EXPECT_NE(ds, nullptr);
  3328. // Create an iterator over the result of the above dataset
  3329. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3330. EXPECT_NE(iter, nullptr);
  3331. // Iterate the dataset and get each row
  3332. std::unordered_map<std::string, mindspore::MSTensor> row;
  3333. ASSERT_OK(iter->GetNextRow(&row));
  3334. uint64_t i = 0;
  3335. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3336. {-1, -1, -1, -1, -1, -1},
  3337. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3338. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3339. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3340. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3341. {-1, -1, -1, -1, -1, -1}};
  3342. while (row.size() != 0) {
  3343. auto ind = row["text"];
  3344. MS_LOG(INFO) << ind.Shape();
  3345. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3346. TensorPtr de_expected_item;
  3347. dsize_t dim = 6;
  3348. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3349. mindspore::MSTensor ms_expected_item =
  3350. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3351. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3352. ASSERT_OK(iter->GetNextRow(&row));
  3353. i++;
  3354. }
  3355. EXPECT_EQ(i, 7);
  3356. // Manually terminate the pipeline
  3357. iter->Stop();
  3358. }
  3359. /// Feature: FastText
  3360. /// Description: test with pre-vectors set that have the different dimension
  3361. /// Expectation: throw correct error and message
  3362. TEST_F(MindDataTestPipeline, TestFastTextDifferentDimension) {
  3363. // Tokens don't have the same number of vectors.
  3364. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDifferentDimension.";
  3365. // Create a TextFile dataset
  3366. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3367. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3368. EXPECT_NE(ds, nullptr);
  3369. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_dim_different.vec";
  3370. std::shared_ptr<FastText> fast_text;
  3371. Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
  3372. EXPECT_NE(s, Status::OK());
  3373. }
  3374. /// Feature: FastText
  3375. /// Description: test with the parameter max_vectors that is <= 0
  3376. /// Expectation: throw correct error and message
  3377. TEST_F(MindDataTestPipeline, TestFastTextMaxVectorsLessThanZero) {
  3378. // Test with max_vectors <= 0.
  3379. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextMaxVectorsLessThanZero.";
  3380. // Create a TextFile dataset
  3381. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3382. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3383. EXPECT_NE(ds, nullptr);
  3384. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  3385. std::shared_ptr<FastText> fast_text;
  3386. Status s = FastText::BuildFromFile(&fast_text, vectors_dir, -1);
  3387. EXPECT_NE(s, Status::OK());
  3388. }
  3389. /// Feature: FastText
  3390. /// Description: test with the pre-vectors file that is empty
  3391. /// Expectation: throw correct error and message
  3392. TEST_F(MindDataTestPipeline, TestFastTextWithEmptyFile) {
  3393. // Read empty file.
  3394. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithEmptyFile.";
  3395. // Create a TextFile dataset
  3396. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3397. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3398. EXPECT_NE(ds, nullptr);
  3399. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_empty.vec";
  3400. std::shared_ptr<FastText> fast_text;
  3401. Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  3402. EXPECT_NE(s, Status::OK());
  3403. }
  3404. /// Feature: FastText
  3405. /// Description: test with the pre-vectors file that is not exist
  3406. /// Expectation: throw correct error and message
  3407. TEST_F(MindDataTestPipeline, TestFastTextWithNotExistFile) {
  3408. // Test with not exist file.
  3409. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithNotExistFile.";
  3410. // Create a TextFile dataset
  3411. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3412. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3413. EXPECT_NE(ds, nullptr);
  3414. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/no_fasttext.vec";
  3415. std::shared_ptr<FastText> fast_text;
  3416. Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  3417. EXPECT_NE(s, Status::OK());
  3418. }
  3419. /// Feature: FastText
  3420. /// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set
  3421. /// Expectation: throw correct error and message
  3422. TEST_F(MindDataTestPipeline, TestFastTextWithWrongInfoFile) {
  3423. // Wrong info.
  3424. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongInfoFile.";
  3425. // Create a TextFile dataset
  3426. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3427. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3428. EXPECT_NE(ds, nullptr);
  3429. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fasttext_with_wrong_info.vec";
  3430. std::shared_ptr<FastText> fast_text;
  3431. Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  3432. EXPECT_NE(s, Status::OK());
  3433. }
  3434. /// Feature: FastText
  3435. /// Description: test with the pre-vectors set that has a wrong suffix
  3436. /// Expectation: throw correct error and message
  3437. TEST_F(MindDataTestPipeline, TestFastTextWithWrongSuffix) {
  3438. // Wrong info.
  3439. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongSuffix.";
  3440. // Create a TextFile dataset
  3441. std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  3442. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3443. EXPECT_NE(ds, nullptr);
  3444. std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.txt";
  3445. std::shared_ptr<FastText> fast_text;
  3446. Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  3447. EXPECT_NE(s, Status::OK());
  3448. }
  3449. /// Feature: GloVe
  3450. /// Description: test with default parameter in function BuildFromFile and function Lookup
  3451. /// Expectation: return correct MSTensor which is equal to the expected
  3452. TEST_F(MindDataTestPipeline, TestGloVeDefaultParam) {
  3453. // Test with default parameter.
  3454. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeDefaultParam.";
  3455. // Create a TextFile dataset
  3456. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3457. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3458. EXPECT_NE(ds, nullptr);
  3459. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  3460. std::shared_ptr<GloVe> glove;
  3461. Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  3462. EXPECT_EQ(s, Status::OK());
  3463. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove);
  3464. EXPECT_NE(lookup, nullptr);
  3465. // Create Map operation on ds
  3466. ds = ds->Map({lookup}, {"text"});
  3467. EXPECT_NE(ds, nullptr);
  3468. // Create an iterator over the result of the above dataset
  3469. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3470. EXPECT_NE(iter, nullptr);
  3471. // Iterate the dataset and get each row
  3472. std::unordered_map<std::string, mindspore::MSTensor> row;
  3473. ASSERT_OK(iter->GetNextRow(&row));
  3474. uint64_t i = 0;
  3475. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3476. {0, 0, 0, 0, 0, 0},
  3477. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3478. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3479. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3480. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3481. {0, 0, 0, 0, 0, 0}};
  3482. while (row.size() != 0) {
  3483. auto ind = row["text"];
  3484. MS_LOG(INFO) << ind.Shape();
  3485. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3486. TensorPtr de_expected_item;
  3487. dsize_t dim = 6;
  3488. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3489. mindspore::MSTensor ms_expected_item =
  3490. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3491. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3492. ASSERT_OK(iter->GetNextRow(&row));
  3493. i++;
  3494. }
  3495. EXPECT_EQ(i, 7);
  3496. // Manually terminate the pipeline
  3497. iter->Stop();
  3498. }
  3499. /// Feature: GloVe
  3500. /// Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
  3501. /// Expectation: return correct MSTensor which is equal to the expected
  3502. TEST_F(MindDataTestPipeline, TestGloVeAllBuildfromfileParams) {
  3503. // Test with two parameters.
  3504. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeAllBuildfromfileParams.";
  3505. // Create a TextFile dataset
  3506. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3507. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3508. EXPECT_NE(ds, nullptr);
  3509. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  3510. std::shared_ptr<GloVe> glove;
  3511. Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100);
  3512. EXPECT_EQ(s, Status::OK());
  3513. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove);
  3514. EXPECT_NE(lookup, nullptr);
  3515. // Create Map operation on ds
  3516. ds = ds->Map({lookup}, {"text"});
  3517. EXPECT_NE(ds, nullptr);
  3518. // Create an iterator over the result of the above dataset
  3519. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3520. EXPECT_NE(iter, nullptr);
  3521. // Iterate the dataset and get each row
  3522. std::unordered_map<std::string, mindspore::MSTensor> row;
  3523. ASSERT_OK(iter->GetNextRow(&row));
  3524. uint64_t i = 0;
  3525. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3526. {0, 0, 0, 0, 0, 0},
  3527. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3528. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3529. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3530. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3531. {0, 0, 0, 0, 0, 0}};
  3532. while (row.size() != 0) {
  3533. auto ind = row["text"];
  3534. MS_LOG(INFO) << ind.Shape();
  3535. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3536. TensorPtr de_expected_item;
  3537. dsize_t dim = 6;
  3538. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3539. mindspore::MSTensor ms_expected_item =
  3540. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3541. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3542. ASSERT_OK(iter->GetNextRow(&row));
  3543. i++;
  3544. }
  3545. EXPECT_EQ(i, 7);
  3546. // Manually terminate the pipeline
  3547. iter->Stop();
  3548. }
  3549. /// Feature: GloVe
  3550. /// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
  3551. /// Expectation: return correct MSTensor which is equal to the expected
  3552. TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) {
  3553. // Test with two parameters.
  3554. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeUnknownInit.";
  3555. // Create a TextFile dataset
  3556. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3557. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3558. EXPECT_NE(ds, nullptr);
  3559. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  3560. std::shared_ptr<GloVe> glove;
  3561. Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100);
  3562. EXPECT_EQ(s, Status::OK());
  3563. std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  3564. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove, unknown_init);
  3565. EXPECT_NE(lookup, nullptr);
  3566. // Create Map operation on ds
  3567. ds = ds->Map({lookup}, {"text"});
  3568. EXPECT_NE(ds, nullptr);
  3569. // Create an iterator over the result of the above dataset
  3570. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3571. EXPECT_NE(iter, nullptr);
  3572. // Iterate the dataset and get each row
  3573. std::unordered_map<std::string, mindspore::MSTensor> row;
  3574. ASSERT_OK(iter->GetNextRow(&row));
  3575. uint64_t i = 0;
  3576. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3577. {-1, -1, -1, -1, -1, -1},
  3578. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3579. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3580. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3581. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3582. {-1, -1, -1, -1, -1, -1}};
  3583. while (row.size() != 0) {
  3584. auto ind = row["text"];
  3585. MS_LOG(INFO) << ind.Shape();
  3586. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3587. TensorPtr de_expected_item;
  3588. dsize_t dim = 6;
  3589. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3590. mindspore::MSTensor ms_expected_item =
  3591. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3592. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3593. ASSERT_OK(iter->GetNextRow(&row));
  3594. i++;
  3595. }
  3596. EXPECT_EQ(i, 7);
  3597. // Manually terminate the pipeline
  3598. iter->Stop();
  3599. }
  3600. /// Feature: GloVe
  3601. /// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
  3602. /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
  3603. /// Expectation: return correct MSTensor which is equal to the expected
  3604. TEST_F(MindDataTestPipeline, TestGloVeAllParams) {
  3605. // Test with all parameters.
  3606. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeAllParams.";
  3607. // Create a TextFile dataset
  3608. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3609. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3610. EXPECT_NE(ds, nullptr);
  3611. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  3612. std::shared_ptr<GloVe> glove;
  3613. Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  3614. EXPECT_EQ(s, Status::OK());
  3615. std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  3616. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove, unknown_init, true);
  3617. EXPECT_NE(lookup, nullptr);
  3618. // Create Map operation on ds
  3619. ds = ds->Map({lookup}, {"text"});
  3620. EXPECT_NE(ds, nullptr);
  3621. // Create an iterator over the result of the above dataset
  3622. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3623. EXPECT_NE(iter, nullptr);
  3624. // Iterate the dataset and get each row
  3625. std::unordered_map<std::string, mindspore::MSTensor> row;
  3626. ASSERT_OK(iter->GetNextRow(&row));
  3627. uint64_t i = 0;
  3628. std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
  3629. {-1, -1, -1, -1, -1, -1},
  3630. {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
  3631. {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
  3632. {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
  3633. {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
  3634. {-1, -1, -1, -1, -1, -1}};
  3635. while (row.size() != 0) {
  3636. auto ind = row["text"];
  3637. MS_LOG(INFO) << ind.Shape();
  3638. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3639. TensorPtr de_expected_item;
  3640. dsize_t dim = 6;
  3641. ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
  3642. mindspore::MSTensor ms_expected_item =
  3643. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3644. EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
  3645. ASSERT_OK(iter->GetNextRow(&row));
  3646. i++;
  3647. }
  3648. EXPECT_EQ(i, 7);
  3649. // Manually terminate the pipeline
  3650. iter->Stop();
  3651. }
  3652. /// Feature: GloVe
  3653. /// Description: test with pre-vectors set that have the different dimension
  3654. /// Expectation: throw correct error and message
  3655. TEST_F(MindDataTestPipeline, TestGloVeDifferentDimension) {
  3656. // Tokens don't have the same number of glove.
  3657. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeDifferentDimension.";
  3658. // Create a TextFile dataset
  3659. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3660. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3661. EXPECT_NE(ds, nullptr);
  3662. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.dim_different.txt";
  3663. std::shared_ptr<GloVe> glove;
  3664. Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100);
  3665. EXPECT_NE(s, Status::OK());
  3666. }
  3667. /// Feature: GloVe
  3668. /// Description: test with the parameter max_vectors that is <= 0
  3669. /// Expectation: throw correct error and message
  3670. TEST_F(MindDataTestPipeline, TestGloVeMaxVectorsLessThanZero) {
  3671. // Test with max_vectors <= 0.
  3672. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeMaxVectorsLessThanZero.";
  3673. // Create a TextFile dataset
  3674. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3675. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3676. EXPECT_NE(ds, nullptr);
  3677. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  3678. std::shared_ptr<GloVe> glove;
  3679. Status s = GloVe::BuildFromFile(&glove, vectors_dir, -1);
  3680. EXPECT_NE(s, Status::OK());
  3681. }
  3682. /// Feature: GloVe
  3683. /// Description: test with the pre-vectors file that is empty
  3684. /// Expectation: throw correct error and message
  3685. TEST_F(MindDataTestPipeline, TestGloVeWithEmptyFile) {
  3686. // Read empty file.
  3687. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithEmptyFile.";
  3688. // Create a TextFile dataset
  3689. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3690. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3691. EXPECT_NE(ds, nullptr);
  3692. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.empty.txt";
  3693. std::shared_ptr<GloVe> glove;
  3694. Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  3695. EXPECT_NE(s, Status::OK());
  3696. }
  3697. /// Feature: GloVe
  3698. /// Description: test with the pre-vectors file that is not exist
  3699. /// Expectation: throw correct error and message
  3700. TEST_F(MindDataTestPipeline, TestGloVeWithNotExistFile) {
  3701. // Test with not exist file.
  3702. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithNotExistFile.";
  3703. // Create a TextFile dataset
  3704. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3705. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3706. EXPECT_NE(ds, nullptr);
  3707. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.empty.txt";
  3708. std::shared_ptr<GloVe> glove;
  3709. Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  3710. EXPECT_NE(s, Status::OK());
  3711. }
  3712. /// Feature: GloVe
  3713. /// Description: test with the pre-vectors set that has a situation that info-head is not the first line in the set
  3714. /// Expectation: throw correct error and message
  3715. TEST_F(MindDataTestPipeline, TestGloVeWithWrongInfoFile) {
  3716. // Wrong info.
  3717. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithWrongInfoFile.";
  3718. // Create a TextFile dataset
  3719. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3720. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3721. EXPECT_NE(ds, nullptr);
  3722. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.with_wrong_info.txt";
  3723. std::shared_ptr<GloVe> glove;
  3724. Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  3725. EXPECT_NE(s, Status::OK());
  3726. }
  3727. /// Feature: GloVe
  3728. /// Description: test with the pre-vectors set that has a wrong format
  3729. /// Expectation: throw correct error and message
  3730. TEST_F(MindDataTestPipeline, TestGloVeWithWrongFormat) {
  3731. // Wrong info.
  3732. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithWrongFormat.";
  3733. // Create a TextFile dataset
  3734. std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  3735. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3736. EXPECT_NE(ds, nullptr);
  3737. std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.tests.vec";
  3738. std::shared_ptr<GloVe> glove;
  3739. Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  3740. EXPECT_NE(s, Status::OK());
  3741. }
  3742. /// Feature: CharNGram
  3743. /// Description: test with default parameter in function BuildFromFile and function Lookup
  3744. /// Expectation: return correct MSTensor which is equal to the excepted
  3745. TEST_F(MindDataTestPipeline, TestCharNGramDefaultParam) {
  3746. // Test with default parameter.
  3747. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramDefaultParam.";
  3748. // Create a TextFile dataset
  3749. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3750. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3751. EXPECT_NE(ds, nullptr);
  3752. std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  3753. std::shared_ptr<CharNGram> char_n_gram;
  3754. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
  3755. EXPECT_EQ(s, Status::OK());
  3756. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram);
  3757. EXPECT_NE(lookup, nullptr);
  3758. // Create Map operation on ds
  3759. ds = ds->Map({lookup}, {"text"});
  3760. EXPECT_NE(ds, nullptr);
  3761. // Create an iterator over the result of the above dataset
  3762. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3763. EXPECT_NE(iter, nullptr);
  3764. // Iterate the dataset and get each row
  3765. std::unordered_map<std::string, mindspore::MSTensor> row;
  3766. ASSERT_OK(iter->GetNextRow(&row));
  3767. uint64_t i = 0;
  3768. std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
  3769. {0, 0, 0, 0, 0},
  3770. {0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
  3771. {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
  3772. {0, 0, 0, 0, 0},
  3773. {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
  3774. {0, 0, 0, 0, 0}};
  3775. while (row.size() != 0) {
  3776. auto ind = row["text"];
  3777. MS_LOG(INFO) << ind.Shape();
  3778. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3779. TensorPtr de_expected_item;
  3780. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
  3781. mindspore::MSTensor ms_expected_item =
  3782. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3783. std::vector<int64_t> ind_shape = ind.Shape();
  3784. std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
  3785. EXPECT_EQ(ind_shape, ms_expected_shape);
  3786. ASSERT_OK(iter->GetNextRow(&row));
  3787. i++;
  3788. }
  3789. EXPECT_EQ(i, 7);
  3790. // Manually terminate the pipeline
  3791. iter->Stop();
  3792. }
  3793. /// Feature: CharNGram.
  3794. /// Description: test with all parameters which include `path` and `max_vector` in function BuildFromFile
  3795. /// Expectation: return correct MSTensor which is equal to the excepted
  3796. TEST_F(MindDataTestPipeline, TestCharNGramAllBuildfromfileParams) {
  3797. // Test with two parameters.
  3798. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramAllBuildfromfileParams.";
  3799. // Create a TextFile dataset
  3800. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3801. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3802. EXPECT_NE(ds, nullptr);
  3803. std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  3804. std::shared_ptr<CharNGram> char_n_gram;
  3805. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, 18);
  3806. EXPECT_EQ(s, Status::OK());
  3807. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram);
  3808. EXPECT_NE(lookup, nullptr);
  3809. // Create Map operation on ds
  3810. ds = ds->Map({lookup}, {"text"});
  3811. EXPECT_NE(ds, nullptr);
  3812. // Create an iterator over the result of the above dataset
  3813. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3814. EXPECT_NE(iter, nullptr);
  3815. // Iterate the dataset and get each row
  3816. std::unordered_map<std::string, mindspore::MSTensor> row;
  3817. ASSERT_OK(iter->GetNextRow(&row));
  3818. uint64_t i = 0;
  3819. std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
  3820. {0, 0, 0, 0, 0},
  3821. {-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
  3822. {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
  3823. {0, 0, 0, 0, 0},
  3824. {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
  3825. {0, 0, 0, 0, 0}};
  3826. while (row.size() != 0) {
  3827. auto ind = row["text"];
  3828. MS_LOG(INFO) << ind.Shape();
  3829. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3830. TensorPtr de_expected_item;
  3831. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
  3832. mindspore::MSTensor ms_expected_item =
  3833. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3834. std::vector<int64_t> ind_shape = ind.Shape();
  3835. std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
  3836. EXPECT_EQ(ind_shape, ms_expected_shape);
  3837. ASSERT_OK(iter->GetNextRow(&row));
  3838. i++;
  3839. }
  3840. EXPECT_EQ(i, 7);
  3841. // Manually terminate the pipeline
  3842. iter->Stop();
  3843. }
  3844. /// Feature: CharNGram
  3845. /// Description: test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
  3846. /// Expectation: return correct MSTensor which is equal to the excepted
  3847. TEST_F(MindDataTestPipeline, TestCharNGramUnknownInit) {
  3848. // Test with two parameters.
  3849. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramUnknownInit.";
  3850. // Create a TextFile dataset
  3851. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3852. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3853. EXPECT_NE(ds, nullptr);
  3854. std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  3855. std::shared_ptr<CharNGram> char_n_gram;
  3856. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, 18);
  3857. EXPECT_EQ(s, Status::OK());
  3858. std::vector<float> unknown_init(5, -1);
  3859. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram, unknown_init);
  3860. EXPECT_NE(lookup, nullptr);
  3861. // Create Map operation on ds
  3862. ds = ds->Map({lookup}, {"text"});
  3863. EXPECT_NE(ds, nullptr);
  3864. // Create an iterator over the result of the above dataset
  3865. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3866. EXPECT_NE(iter, nullptr);
  3867. // Iterate the dataset and get each row
  3868. std::unordered_map<std::string, mindspore::MSTensor> row;
  3869. ASSERT_OK(iter->GetNextRow(&row));
  3870. uint64_t i = 0;
  3871. std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
  3872. {-1, -1, -1, -1, -1},
  3873. {-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
  3874. {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
  3875. {-1, -1, -1, -1, -1},
  3876. {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
  3877. {-1, -1, -1, -1, -1}};
  3878. while (row.size() != 0) {
  3879. auto ind = row["text"];
  3880. MS_LOG(INFO) << ind.Shape();
  3881. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3882. TensorPtr de_expected_item;
  3883. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
  3884. mindspore::MSTensor ms_expected_item =
  3885. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3886. std::vector<int64_t> ind_shape = ind.Shape();
  3887. std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
  3888. EXPECT_EQ(ind_shape, ms_expected_shape);
  3889. ASSERT_OK(iter->GetNextRow(&row));
  3890. i++;
  3891. }
  3892. EXPECT_EQ(i, 7);
  3893. // Manually terminate the pipeline
  3894. iter->Stop();
  3895. }
  3896. /// Feature: CharNGram
  3897. /// Description: test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
  3898. /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
  3899. /// Expectation: return correct MSTensor which is equal to the excepted
  3900. TEST_F(MindDataTestPipeline, TestCharNGramAllParams) {
  3901. // Test with all parameters.
  3902. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramAllParams.";
  3903. // Create a TextFile dataset
  3904. std::string data_file = datasets_root_path_ + "/testVectors/words_with_big_letter.txt";
  3905. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3906. EXPECT_NE(ds, nullptr);
  3907. std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  3908. std::shared_ptr<CharNGram> char_n_gram;
  3909. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
  3910. EXPECT_EQ(s, Status::OK());
  3911. std::vector<float> unknown_init(5, -1);
  3912. std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram, unknown_init, true);
  3913. EXPECT_NE(lookup, nullptr);
  3914. // Create Map operation on ds
  3915. ds = ds->Map({lookup}, {"text"});
  3916. EXPECT_NE(ds, nullptr);
  3917. // Create an iterator over the result of the above dataset
  3918. std::shared_ptr<Iterator> iter = ds->CreateIterator();
  3919. EXPECT_NE(iter, nullptr);
  3920. // Iterate the dataset and get each row
  3921. std::unordered_map<std::string, mindspore::MSTensor> row;
  3922. ASSERT_OK(iter->GetNextRow(&row));
  3923. uint64_t i = 0;
  3924. std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
  3925. {-1, -1, -1, -1, -1},
  3926. {0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
  3927. {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
  3928. {-1, -1, -1, -1, -1},
  3929. {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
  3930. {-1, -1, -1, -1, -1}};
  3931. while (row.size() != 0) {
  3932. auto ind = row["text"];
  3933. MS_LOG(INFO) << ind.Shape();
  3934. TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
  3935. TensorPtr de_expected_item;
  3936. ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
  3937. mindspore::MSTensor ms_expected_item =
  3938. mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
  3939. std::vector<int64_t> ind_shape = ind.Shape();
  3940. std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
  3941. EXPECT_EQ(ind_shape, ms_expected_shape);
  3942. ASSERT_OK(iter->GetNextRow(&row));
  3943. i++;
  3944. }
  3945. EXPECT_EQ(i, 7);
  3946. // Manually terminate the pipeline
  3947. iter->Stop();
  3948. }
  3949. /// Feature: CharNGram
  3950. /// Description: test with pre-vectors set that have the different dimension
  3951. /// Expectation: throw correct error and message
  3952. TEST_F(MindDataTestPipeline, TestCharNGramDifferentDimension) {
  3953. // Tokens don't have the same number of vectors.
  3954. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramDifferentDimension.";
  3955. // Create a TextFile dataset
  3956. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3957. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3958. EXPECT_NE(ds, nullptr);
  3959. std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20_dim_different.txt";
  3960. std::shared_ptr<CharNGram> char_n_gram;
  3961. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
  3962. EXPECT_NE(s, Status::OK());
  3963. }
  3964. /// Feature: CharNGram
  3965. /// Description: test with the parameter max_vectors that is <= 0
  3966. /// Expectation: throw correct error and message
  3967. TEST_F(MindDataTestPipeline, TestCharNGramMaxVectorsLessThanZero) {
  3968. // Test with max_vectors <= 0.
  3969. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramMaxVectorsLessThanZero.";
  3970. // Create a TextFile dataset
  3971. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3972. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3973. EXPECT_NE(ds, nullptr);
  3974. std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  3975. std::shared_ptr<CharNGram> char_n_gram;
  3976. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, -1);
  3977. EXPECT_NE(s, Status::OK());
  3978. }
  3979. /// Feature: CharNGram
  3980. /// Description: test with the pre-vectors file that is empty
  3981. /// Expectation: throw correct error and message
  3982. TEST_F(MindDataTestPipeline, TestCharNGramWithEmptyFile) {
  3983. // Read empty file.
  3984. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramWithEmptyFile.";
  3985. // Create a TextFile dataset
  3986. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  3987. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  3988. EXPECT_NE(ds, nullptr);
  3989. std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_empty.txt";
  3990. std::shared_ptr<CharNGram> char_n_gram;
  3991. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
  3992. EXPECT_NE(s, Status::OK());
  3993. }
  3994. /// Feature: CharNGram
  3995. /// Description: test with the pre-vectors file that is not exist
  3996. /// Expectation: throw correct error and message
  3997. TEST_F(MindDataTestPipeline, TestCharNGramsWithNotExistFile) {
  3998. // Test with not exist file.
  3999. MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramsWithNotExistFile.";
  4000. // Create a TextFile dataset
  4001. std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  4002. std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  4003. EXPECT_NE(ds, nullptr);
  4004. std::string vectors_dir = datasets_root_path_ + "/testVectors/no_vectors.txt";
  4005. std::shared_ptr<CharNGram> char_n_gram;
  4006. Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
  4007. EXPECT_NE(s, Status::OK());
  4008. }