You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

data_schema.cc 20 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "dataset/engine/data_schema.h"
  17. #include <algorithm>
  18. #include <fstream>
  19. #include <iostream>
  20. #include <map>
  21. #include <memory>
  22. #include <sstream>
  23. #include <nlohmann/json.hpp>
  24. #include "common/utils.h"
  25. #include "dataset/util/status.h"
  26. #include "dataset/core/tensor_shape.h"
  27. #include "utils/log_adapter.h"
  28. #include "dataset/util/de_error.h"
  29. namespace mindspore {
  30. namespace dataset {
  // Converts a string naming a tensor implementation into its TensorImpl enumerator.
  //   "cvmat" -> TensorImpl::kCv
  //   "flex"  -> TensorImpl::kFlexible
  //   "np"    -> TensorImpl::kNP
  //   other   -> TensorImpl::kNone
  // in_col_str: any expression that can be compared against a string literal with ==.
  // out_type:   an assignable expression that receives the resulting TensorImpl value.
  // Both arguments are parenthesized so that compound expressions (for example a
  // conditional expression) expand as intended. Note that in_col_str may be evaluated
  // up to three times, so it should be free of side effects.
  #define STR_TO_TENSORIMPL(in_col_str, out_type) \
    do { \
      if ((in_col_str) == "cvmat") { \
        (out_type) = TensorImpl::kCv; \
      } else if ((in_col_str) == "flex") { \
        (out_type) = TensorImpl::kFlexible; \
      } else if ((in_col_str) == "np") { \
        (out_type) = TensorImpl::kNP; \
      } else { \
        (out_type) = TensorImpl::kNone; \
      } \
    } while (false)
  45. // Constructor 1: Simple constructor that leaves things uninitialized.
  46. ColDescriptor::ColDescriptor()
  47. : type_(DataType::DE_UNKNOWN), rank_(0), tensor_impl_(TensorImpl::kNone), tensor_shape_(nullptr) {}
  48. // Constructor 2: Main constructor
  49. ColDescriptor::ColDescriptor(const std::string &col_name, DataType col_type, TensorImpl tensor_impl, int32_t rank,
  50. const TensorShape *in_shape)
  51. : type_(col_type), rank_(rank), tensor_impl_(tensor_impl), col_name_(col_name) {
  52. // If a shape was provided, create unique pointer for it and copy construct it into
  53. // our shape. Otherwise, set our shape to be empty.
  54. if (in_shape != nullptr) {
  55. // Create a shape and copy construct it into our column's shape.
  56. tensor_shape_ = std::make_unique<TensorShape>(*in_shape);
  57. } else {
  58. tensor_shape_ = nullptr;
  59. }
  60. // If the user input a shape, then the rank of the input shape needs to match
  61. // the input rank
  62. if (in_shape != nullptr && in_shape->known() && in_shape->Size() != rank_) {
  63. rank_ = in_shape->Size();
  64. MS_LOG(INFO) << "Rank does not match the number of dimensions in the provided shape."
  65. << " Overriding rank with the number of dimensions in the provided shape.";
  66. }
  67. }
  68. // Explicit copy constructor is required
  69. ColDescriptor::ColDescriptor(const ColDescriptor &in_cd)
  70. : type_(in_cd.type_), rank_(in_cd.rank_), tensor_impl_(in_cd.tensor_impl_), col_name_(in_cd.col_name_) {
  71. // If it has a tensor shape, make a copy of it with our own unique_ptr.
  72. tensor_shape_ = in_cd.hasShape() ? std::make_unique<TensorShape>(in_cd.shape()) : nullptr;
  73. }
  74. // Assignment overload
  75. ColDescriptor &ColDescriptor::operator=(const ColDescriptor &in_cd) {
  76. if (&in_cd != this) {
  77. type_ = in_cd.type_;
  78. rank_ = in_cd.rank_;
  79. tensor_impl_ = in_cd.tensor_impl_;
  80. col_name_ = in_cd.col_name_;
  81. // If it has a tensor shape, make a copy of it with our own unique_ptr.
  82. tensor_shape_ = in_cd.hasShape() ? std::make_unique<TensorShape>(in_cd.shape()) : nullptr;
  83. }
  84. return *this;
  85. }
  86. // Destructor
  87. ColDescriptor::~ColDescriptor() = default;
  88. // A print method typically used for debugging
  89. void ColDescriptor::Print(std::ostream &out) const {
  90. out << " Name : " << col_name_ << "\n Type : " << type_ << "\n Rank : " << rank_
  91. << "\n Shape : (";
  92. if (tensor_shape_) {
  93. out << *tensor_shape_ << ")\n";
  94. } else {
  95. out << "no shape provided)\n";
  96. }
  97. }
  98. // Given a number of elements, this function will compute what the actual Tensor shape would be.
  99. // If there is no starting TensorShape in this column, or if there is a shape but it contains
  100. // an unknown dimension, then the output shape returned shall resolve dimensions as needed.
  101. Status ColDescriptor::MaterializeTensorShape(int32_t num_elements, TensorShape *out_shape) const {
  102. if (out_shape == nullptr) {
  103. RETURN_STATUS_UNEXPECTED("Unexpected null output shape argument.");
  104. }
  105. // If the shape is not given in this column, then we assume the shape will be: {numElements}
  106. if (tensor_shape_ == nullptr) {
  107. if (this->rank() == 0 && num_elements == 1) {
  108. *out_shape = TensorShape::CreateScalar();
  109. return Status::OK();
  110. }
  111. *out_shape = TensorShape({num_elements});
  112. return Status::OK();
  113. }
  114. // Build the real TensorShape based on the requested shape and the number of elements in the data.
  115. // If there are unknown dimensions, then the unknown dimension needs to be filled in.
  116. // Example: requestedShape: {?,4,3}.
  117. // If numElements is 24, then the output shape can be computed to: {2,4,3}
  118. std::vector<dsize_t> requested_shape = tensor_shape_->AsVector();
  119. int64_t num_elements_of_shape = 1; // init to 1 as a starting multiplier.
  120. // unknownDimPosition variable is overloaded to provide 2 meanings:
  121. // 1) If it's set to DIM_UNKNOWN, then it provides a boolean knowledge to tell us if there are
  122. // any unknown dimensions. i.e. if it's set to unknown, then there are no unknown dimensions.
  123. // 2) If it's set to a numeric value, then this is the vector index position within the shape
  124. // where the single unknown dimension can be found.
  125. int64_t unknown_dim_position = TensorShape::kDimUnknown; // Assume there are no unknown dims to start
  126. for (int i = 0; i < requested_shape.size(); ++i) {
  127. // If we already had an unknown dimension, then we cannot have a second unknown dimension.
  128. // We only support the compute of a single unknown dim.
  129. if (requested_shape[i] == TensorShape::kDimUnknown && unknown_dim_position != TensorShape::kDimUnknown) {
  130. return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
  131. "Requested shape has more than one unknown dimension!");
  132. }
  133. // If the current dimension in the requested shape is a known value, then compute the number of
  134. // elements so far.
  135. if (requested_shape[i] != TensorShape::kDimUnknown) {
  136. num_elements_of_shape *= requested_shape[i];
  137. } else {
  138. // This dimension is unknown so track which dimension position has it.
  139. unknown_dim_position = i;
  140. }
  141. }
  142. // Sanity check the the computed element counts divide evenly into the input element count
  143. if (num_elements < num_elements_of_shape || num_elements_of_shape == 0 || num_elements % num_elements_of_shape != 0) {
  144. RETURN_STATUS_UNEXPECTED("Requested shape has an invalid element count!");
  145. }
  146. // If there was any unknown dimensions, then update the requested shape to fill in the unknown
  147. // dimension with the correct value. If there were no unknown dim's then the output shape will
  148. // remain to be the same as the requested shape.
  149. if (unknown_dim_position != TensorShape::kDimUnknown) {
  150. requested_shape[unknown_dim_position] = (num_elements / num_elements_of_shape);
  151. }
  152. // Any unknown dimension is filled in now. Set the output shape
  153. *out_shape = TensorShape(requested_shape);
  154. return Status::OK();
  155. }
  156. // getter function for the shape
  157. TensorShape ColDescriptor::shape() const {
  158. if (tensor_shape_ != nullptr) {
  159. return *tensor_shape_; // copy construct a shape to return
  160. } else {
  161. return TensorShape::CreateUnknownRankShape(); // empty shape to return
  162. }
  163. }
  164. const char DataSchema::DEFAULT_DATA_SCHEMA_FILENAME[] = "datasetSchema.json";
  165. // Constructor 1: Simple constructor that leaves things uninitialized.
  166. DataSchema::DataSchema() : dataset_type_(DatasetType::kUnknown), num_rows_(0) {}
  167. DatasetType DataSchema::GetDatasetTYpeFromString(const std::string &type) const {
  168. // Convert the string to a more easy to manage enum flavour of the buffer type.
  169. if (type == "ARROW") {
  170. return DatasetType::kArrow;
  171. } else if (type == "TF") {
  172. return DatasetType::kTf;
  173. } else {
  174. return DatasetType::kUnknown;
  175. }
  176. }
  177. Status DataSchema::LoadDatasetType(const std::string &schema_file_path) {
  178. try {
  179. std::ifstream in(schema_file_path);
  180. nlohmann::json js;
  181. in >> js;
  182. // First, get the column for the type of dataset.
  183. dataset_type_str_ = js.value("datasetType", "");
  184. dataset_type_ = GetDatasetTYpeFromString(dataset_type_str_);
  185. dir_structure_ = js.value("directoryStructure", "");
  186. }
  187. // Catch any exception and convert to Status return code
  188. catch (const std::exception &err) {
  189. RETURN_STATUS_UNEXPECTED("Schema file failed to load");
  190. }
  191. return Status::OK();
  192. }
  193. // Internal helper function. Parses the json schema file in any order and produces a schema that
  194. // does not follow any particular order (json standard does not enforce any ordering protocol).
  195. // This one produces a schema that contains all of the columns from the schema file.
  196. Status DataSchema::AnyOrderLoad(nlohmann::json column_tree) {
  197. // Iterate over the json file. Each parent json node is the column name,
  198. // followed by the column properties in the child tree under the column.
  199. // Outer loop here iterates over the parents (i.e. the column name)
  200. if (!column_tree.is_array()) {
  201. for (nlohmann::json::iterator it = column_tree.begin(); it != column_tree.end(); ++it) {
  202. std::string col_name = it.key();
  203. nlohmann::json column_child_tree = it.value();
  204. RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, col_name));
  205. }
  206. } else {
  207. // Case where the schema is a list of columns not a dict
  208. for (nlohmann::json::iterator it = column_tree.begin(); it != column_tree.end(); ++it) {
  209. nlohmann::json column_child_tree = it.value();
  210. RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, ""));
  211. }
  212. }
  213. return Status::OK();
  214. }
  215. // Internal helper function. For each input column name, perform a lookup to the json document to
  216. // find the matching column. When the match is found, process that column to build the column
  217. // descriptor and add to the schema in the order in which the input column names are given.id
  218. Status DataSchema::ColumnOrderLoad(nlohmann::json column_tree, const std::vector<std::string> &columns_to_load) {
  219. if (!column_tree.is_array()) {
  220. // the json file is dict (e.g., {image: ...})
  221. // Loop over the column name list
  222. for (const auto &curr_col_name : columns_to_load) {
  223. // Find the column in the json document
  224. auto column_info = column_tree.find(common::SafeCStr(curr_col_name));
  225. if (column_info == column_tree.end()) {
  226. RETURN_STATUS_UNEXPECTED("Failed to find column " + curr_col_name);
  227. }
  228. // At this point, columnInfo.value() is the subtree in the json document that contains
  229. // all of the data for a given column. This data will formulate our schema column.
  230. const std::string &col_name = column_info.key();
  231. nlohmann::json column_child_tree = column_info.value();
  232. RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, col_name));
  233. }
  234. } else {
  235. // the json file is array (e.g., [name: image...])
  236. // Loop over the column name list
  237. for (const auto &curr_col_name : columns_to_load) {
  238. // Find the column in the json document
  239. int32_t index = -1;
  240. int32_t i = 0;
  241. for (const auto &it_child : column_tree.items()) {
  242. auto name = it_child.value().find("name");
  243. if (name == it_child.value().end()) {
  244. RETURN_STATUS_UNEXPECTED("Name field is missing for this column.");
  245. }
  246. if (name.value() == curr_col_name) {
  247. index = i;
  248. break;
  249. }
  250. i++;
  251. }
  252. if (index == -1) {
  253. RETURN_STATUS_UNEXPECTED("Failed to find column " + curr_col_name);
  254. }
  255. nlohmann::json column_child_tree = column_tree[index];
  256. RETURN_IF_NOT_OK(ColumnLoad(column_child_tree, curr_col_name));
  257. }
  258. }
  259. return Status::OK();
  260. }
  261. // Internal helper function for parsing shape info and building a vector for the shape construction.
  262. static Status buildShape(const nlohmann::json &shapeVal, std::vector<dsize_t> *outShape) {
  263. if (outShape == nullptr) {
  264. RETURN_STATUS_UNEXPECTED("null output shape");
  265. }
  266. if (shapeVal.empty()) return Status::OK();
  267. // Iterate over the integer list and add those values to the output shape tensor
  268. auto items = shapeVal.items();
  269. using it_type = decltype(items.begin());
  270. (void)std::transform(items.begin(), items.end(), std::back_inserter(*outShape), [](it_type j) { return j.value(); });
  271. return Status::OK();
  272. }
  273. // Internal helper function. Given the json tree for a given column, load it into our schema.
  274. Status DataSchema::ColumnLoad(nlohmann::json column_child_tree, const std::string &col_name) {
  275. int32_t rank_value = -1;
  276. TensorImpl t_impl_value = TensorImpl::kFlexible;
  277. std::string name, type_str;
  278. std::vector<dsize_t> tmp_shape = {};
  279. bool shape_field_exists = false;
  280. // Iterate over this column's attributes.
  281. // Manually iterating each of the child nodes/trees here so that we can provide our own error handling.
  282. for (const auto &it_child : column_child_tree.items()) {
  283. // Save the data for each of the attributes into variables. We'll use these to construct later.
  284. if (it_child.key() == "name") {
  285. name = it_child.value();
  286. } else if (it_child.key() == "type") {
  287. type_str = it_child.value();
  288. } else if (it_child.key() == "rank") {
  289. rank_value = it_child.value();
  290. } else if (it_child.key() == "t_impl") {
  291. STR_TO_TENSORIMPL(it_child.value(), t_impl_value);
  292. } else if (it_child.key() == "shape") {
  293. shape_field_exists = true;
  294. RETURN_IF_NOT_OK(buildShape(it_child.value(), &tmp_shape));
  295. } else {
  296. std::string err_msg = "Unexpected column attribute " + it_child.key() + " for column " + col_name;
  297. RETURN_STATUS_UNEXPECTED(err_msg);
  298. }
  299. }
  300. if (!name.empty()) {
  301. if (!col_name.empty() && col_name != name) {
  302. std::string err_msg =
  303. "json schema file for column " + col_name + " has column name that does not match columnsToLoad";
  304. RETURN_STATUS_UNEXPECTED(err_msg);
  305. }
  306. } else {
  307. if (col_name.empty()) {
  308. std::string err_msg = "json schema file for column " + col_name + " has invalid or missing column name.";
  309. RETURN_STATUS_UNEXPECTED(err_msg);
  310. } else {
  311. name = col_name;
  312. }
  313. }
  314. // data type is mandatory field
  315. if (type_str.empty())
  316. return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
  317. "json schema file for column " + col_name + " has invalid or missing column type.");
  318. // rank number is mandatory field
  319. if (rank_value <= -1)
  320. return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
  321. "json schema file for column " + col_name + " must define a positive rank value.");
  322. // Create the column descriptor for this column from the data we pulled from the json file
  323. TensorShape col_shape = TensorShape(tmp_shape);
  324. if (shape_field_exists)
  325. (void)this->AddColumn(ColDescriptor(name, DataType(type_str), t_impl_value, rank_value, &col_shape));
  326. else
  327. // Create a column descriptor that doesn't have a shape
  328. (void)this->AddColumn(ColDescriptor(name, DataType(type_str), t_impl_value, rank_value));
  329. return Status::OK();
  330. }
  331. // Parses a schema json file and populates the columns and meta info.
  332. Status DataSchema::LoadSchemaFile(const std::string &schema_file_path,
  333. const std::vector<std::string> &columns_to_load) {
  334. try {
  335. std::ifstream in(schema_file_path);
  336. nlohmann::json js;
  337. in >> js;
  338. RETURN_IF_NOT_OK(PreLoadExceptionCheck(js));
  339. try {
  340. num_rows_ = js.at("numRows").get<int64_t>();
  341. } catch (nlohmann::json::out_of_range &e) {
  342. num_rows_ = 0;
  343. } catch (nlohmann::json::exception &e) {
  344. RETURN_STATUS_UNEXPECTED("Unable to parse \"numRows\" from schema");
  345. }
  346. nlohmann::json column_tree = js.at("columns");
  347. if (column_tree.empty()) {
  348. RETURN_STATUS_UNEXPECTED("columns is null");
  349. }
  350. if (columns_to_load.empty()) {
  351. // Parse the json tree and load the schema's columns in whatever order that the json
  352. // layout decides
  353. RETURN_IF_NOT_OK(this->AnyOrderLoad(column_tree));
  354. } else {
  355. RETURN_IF_NOT_OK(this->ColumnOrderLoad(column_tree, columns_to_load));
  356. }
  357. } catch (const std::exception &err) {
  358. // Catch any exception and convert to Status return code
  359. RETURN_STATUS_UNEXPECTED("Schema file failed to load");
  360. }
  361. return Status::OK();
  362. }
  363. // Parses a schema json string and populates the columns and meta info.
  364. Status DataSchema::LoadSchemaString(const std::string &schema_json_string,
  365. const std::vector<std::string> &columns_to_load) {
  366. try {
  367. nlohmann::json js = nlohmann::json::parse(schema_json_string);
  368. RETURN_IF_NOT_OK(PreLoadExceptionCheck(js));
  369. num_rows_ = js.value("numRows", 0);
  370. dataset_type_str_ = js.value("datasetType", "");
  371. dataset_type_ = GetDatasetTYpeFromString(dataset_type_str_);
  372. nlohmann::json column_tree = js.at("columns");
  373. if (column_tree.empty()) {
  374. RETURN_STATUS_UNEXPECTED("columns is null");
  375. }
  376. if (columns_to_load.empty()) {
  377. // Parse the json tree and load the schema's columns in whatever order that the json
  378. // layout decides
  379. RETURN_IF_NOT_OK(this->AnyOrderLoad(column_tree));
  380. } else {
  381. RETURN_IF_NOT_OK(this->ColumnOrderLoad(column_tree, columns_to_load));
  382. }
  383. } catch (const std::exception &err) {
  384. // Catch any exception and convert to Status return code
  385. RETURN_STATUS_UNEXPECTED("Schema file failed to load");
  386. }
  387. return Status::OK();
  388. }
  389. // Destructor
  390. DataSchema::~DataSchema() = default;
  391. // Getter for the ColDescriptor by index
  392. const ColDescriptor &DataSchema::column(int32_t idx) const {
  393. DS_ASSERT(idx < static_cast<int>(col_descs_.size()));
  394. return col_descs_[idx];
  395. }
  396. // A print method typically used for debugging
  397. void DataSchema::Print(std::ostream &out) const {
  398. out << "Dataset type string : (";
  399. if (dataset_type_str_.empty()) {
  400. out << "none specified)\n";
  401. } else {
  402. out << dataset_type_str_ << ")\n";
  403. }
  404. for (const auto &col_desc : col_descs_) {
  405. out << col_desc << "\n";
  406. }
  407. out << "Dataset type: " << static_cast<uint32_t>(dataset_type_) << "\n";
  408. }
  409. // Adds a column descriptor to the schema
  410. Status DataSchema::AddColumn(const ColDescriptor &cd) {
  411. // Sanity check there's not a duplicate name before adding the column
  412. for (int32_t i = 0; i < col_descs_.size(); ++i) {
  413. if (col_descs_[i].name() == cd.name()) {
  414. std::ostringstream ss;
  415. ss << "column name '" << cd.name() << "' already exists in schema.";
  416. std::string err_msg = ss.str();
  417. RETURN_STATUS_UNEXPECTED(err_msg);
  418. }
  419. }
  420. col_descs_.push_back(cd);
  421. return Status::OK();
  422. }
  423. // Internal helper function. Performs sanity checks on the json file setup.
  424. Status DataSchema::PreLoadExceptionCheck(const nlohmann::json &js) {
  425. // Check if columns node exists. It is required for building schema from file.
  426. if (js.find("columns") == js.end())
  427. return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
  428. "\"columns\" node is required in the schema json file.");
  429. return Status::OK();
  430. }
  431. } // namespace dataset
  432. } // namespace mindspore