Alexandria  2.16
Please provide a description of the project.
AsciiReaderHelper.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2012-2020 Euclid Science Ground Segment
3  *
4  * This library is free software; you can redistribute it and/or modify it under
5  * the terms of the GNU Lesser General Public License as published by the Free
6  * Software Foundation; either version 3.0 of the License, or (at your option)
7  * any later version.
8  *
9  * This library is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU Lesser General Public License
15  * along with this library; if not, write to the Free Software Foundation, Inc.,
16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
25 #include <set>
26 #include <sstream>
27 #include <boost/algorithm/string.hpp>
28 #include <boost/lexical_cast.hpp>
29 #include <boost/tokenizer.hpp>
31 #include "ElementsKernel/Logging.h"
32 #include "AsciiReaderHelper.h"
33 #include "NdArray/NdArray.h"
34 
35 namespace Euclid {
36 namespace Table {
37 
38 using NdArray::NdArray;
39 
41 
42 size_t countColumns(std::istream& in, const std::string& comment) {
43  StreamRewinder rewinder {in};
44  size_t count = 0;
45 
46  while (in) {
47  std::string line;
48  getline(in, line);
49  // Remove any comments
50  size_t comment_pos = line.find(comment);
51  if (comment_pos != std::string::npos) {
52  line = line.substr(0, comment_pos);
53  }
54  boost::trim(line);
55  if (!line.empty()) {
56  std::string token;
57  std::stringstream line_stream(line);
58  line_stream >> boost::io::quoted(token);
59  while (line_stream) {
60  line_stream >> boost::io::quoted(token);
61  ++count;
62  }
63  break;
64  }
65  }
66  if (count == 0) {
67  throw Elements::Exception() << "No data lines found";
68  }
69  return count;
70 }
71 
// Body of the keyword-to-type mapping (the signature line is not part of this
// listing). Translates a column type keyword into the typeid of the C++ type:
//   - plain keywords select scalars: bool/boolean, int/int32, long/int64,
//     float, double, string
//   - "[kw]" selects std::vector of the scalar (no "[string]" form is handled)
//   - "[kw+]" selects NdArray of the scalar
// The comparison is exact: keywords are matched as written, with no trimming
// or case folding done here.
// Throws Elements::Exception for any keyword not listed below.
73  if (keyword == "bool" || keyword == "boolean") {
74  return typeid(bool);
75  } else if (keyword == "int" || keyword == "int32") {
76  return typeid(int32_t);
77  } else if (keyword == "long" || keyword == "int64") {
78  return typeid(int64_t);
79  } else if (keyword == "float") {
80  return typeid(float);
81  } else if (keyword == "double") {
82  return typeid(double);
83  } else if (keyword == "string") {
84  return typeid(std::string);
85  } else if (keyword == "[bool]" || keyword == "[boolean]") {
86  return typeid(std::vector<bool>);
87  } else if (keyword == "[int]" || keyword == "[int32]") {
88  return typeid(std::vector<int32_t>);
89  } else if (keyword == "[long]" || keyword == "[int64]") {
90  return typeid(std::vector<int64_t>);
91  } else if (keyword == "[float]") {
92  return typeid(std::vector<float>);
93  } else if (keyword == "[double]") {
94  return typeid(std::vector<double>);
95  } else if (keyword == "[bool+]" || keyword == "[boolean+]") {
96  return typeid(NdArray<bool>);
97  } else if (keyword == "[int+]" || keyword == "[int32+]") {
98  return typeid(NdArray<int32_t>);
99  } else if (keyword == "[long+]" || keyword == "[int64+]") {
100  return typeid(NdArray<int64_t>);
101  } else if (keyword == "[float+]") {
102  return typeid(NdArray<float>);
103  } else if (keyword == "[double+]") {
104  return typeid(NdArray<double>);
105  }
106  throw Elements::Exception() << "Unknown column type keyword " << keyword;
107 }
108 
110  std::istream& in, const std::string& comment) {
// Reads the column descriptions from the comment lines that precede the data.
// (The first signature line and the declaration of `descriptions` are not part
// of this listing.)
//
// Expected shape of a description line, after the comment marker:
//   Column: NAME [TYPE] [(UNIT)] [-] [DESCRIPTION...]
// where TYPE is a keyword understood by keywordToType and defaults to
// std::string when absent. Tokens are read with boost::io::quoted, so NAME and
// the other tokens may be double-quoted.
//
// Scanning stops at the first non-empty line that does not start with the
// comment marker. Throws Elements::Exception on a repeated column name, or
// (via keywordToType) on an unknown type keyword.
111  StreamRewinder rewinder {in};
113  while (in) {
114  std::string line;
115  getline(in, line);
116  boost::trim(line);
117  if (line.empty()) {
118  continue; // We skip empty lines
119  }
120  if (boost::starts_with(line, comment)) {
121  // If we have a comment we remove all comment characters and check if we have
122  // a column description
// Note that replace_all drops every occurrence of the marker in the line, not
// only the leading one.
123  boost::replace_all(line, comment, "");
124  boost::trim(line);
125  if (boost::starts_with(line, "Column:")) {
// 7 is the length of the "Column:" prefix
126  line.erase(0, 7);
127  boost::trim(line);
128  if (!line.empty()) {
129  std::string token;
130  std::stringstream line_stream(line);
131  std::string name;
132  line_stream >> boost::io::quoted(name);
133  if (descriptions.count(name) != 0) {
134  throw Elements::Exception() << "Duplicate column name " << name;
135  }
136  line_stream >> boost::io::quoted(token);
137  std::type_index type = typeid(std::string);
// A token that is neither a unit "(...)" nor the "-" separator is taken as the
// type keyword
138  if (line_stream) {
139  if (!boost::starts_with(token, "(") && token != "-") {
140  type = keywordToType(token);
141  line_stream >> boost::io::quoted(token);
142  }
143  }
144  std::string unit = "";
145  if (line_stream) {
146  if (boost::starts_with(token, "(")) {
// Strip the first and the last character of the token to get the unit.
// NOTE(review): the last character is removed without checking that it is ')',
// and a token consisting only of "(" is already empty after the first erase,
// so unit.end()-1 is then not a valid position - worth guarding.
147  unit = token;
148  unit.erase(unit.begin());
149  unit.erase(unit.end()-1);
150  line_stream >> boost::io::quoted(token);
151  }
152  }
// An optional "-" separates the type/unit part from the free text
153  if (line_stream && token == "-") {
154  line_stream >> boost::io::quoted(token);
155  }
// The remaining tokens are joined with single spaces to form the description
156  std::stringstream desc;
157  while (line_stream) {
158  desc << token << ' ';
159  line_stream >> boost::io::quoted(token);
160  }
161  std::string desc_str = desc.str();
162  boost::trim(desc_str);
163  descriptions.emplace(std::piecewise_construct,
164  std::forward_as_tuple(name),
165  std::forward_as_tuple(name, type, unit, desc_str));
166  }
167  }
168  } else {
169  break; // here we reached the first data line
170  }
171  }
172  return descriptions;
173 }
174 
176  const std::string& comment,
177  size_t columns_number) {
// Determines the column names from the comment lines that precede the data.
// (The first signature line is not part of this listing.)
//
// Sources, in order of preference:
//   1. the last non-empty comment line, if it has exactly columns_number tokens
//   2. the names found in "Column:" description comments
//   3. generated names "col<i>" (1-based) for every column still unnamed
//
// Throws Elements::Exception if the resulting list contains a repeated name.
178  StreamRewinder rewinder {in};
179  std::vector<std::string> names {};
180 
181  // Find the last comment line and at the same time read the names of the
182  // column info description comments
183  std::string last_comment {};
184  std::vector<std::string> desc_names {};
185  while (in) {
186  std::string line;
187  getline(in, line);
188  boost::trim(line);
189  if (line.empty()) {
190  continue; // We skip empty lines
191  }
192  if (boost::starts_with(line, comment)) {
193  // If we have a comment we remove all comment characters and check if we have
194  // the correct number of tokens
195  boost::replace_all(line, comment, "");
196  boost::trim(line);
197  if (!line.empty()) {
198  last_comment = line;
199  }
200  if (boost::starts_with(line, "Column:")) {
201  std::string temp = line;
// 7 is the length of the "Column:" prefix
202  temp.erase(0, 7);
203  boost::trim(temp);
// The name is the text up to the first space. find() yields npos when there is
// no space; npos also satisfies "> 0" and substr(0, npos) keeps the whole text.
// NOTE(review): this cut is not quote-aware, unlike the boost::io::quoted
// parsing applied to the last comment line below - confirm this is intended.
204  auto space_i = temp.find(' ');
205  if (space_i > 0) {
206  temp = temp.substr(0, space_i);
207  }
208  desc_names.emplace_back(std::move(temp));
209  }
210  } else {
211  break; // here we reached the first data line
212  }
213  }
214 
215  // Check if the last comment line contains the names of the columns
216  if (!last_comment.empty()){
217  std::stringstream line_stream(last_comment);
218  std::string token;
219  line_stream >> boost::io::quoted(token);
220  while (line_stream) {
221  names.push_back(token);
222  line_stream >> boost::io::quoted(token);
223  }
// A token count different from the column count means that the line was not a
// header line, so its tokens are discarded
224  if (names.size() != columns_number) {
225  names.clear();
226  }
227  }
228 
229  // If the names are empty we fill them with the column description ones
230  if (names.empty()) {
231  if (desc_names.size() != 0 && desc_names.size() != columns_number) {
232  logger.warn() << "Number of column descriptions does not matches the number"
233  << " of the columns";
234  }
235  names = desc_names;
236  }
237 
// Columns that are still unnamed get the default name "col<i>", i being the
// 1-based column index
238  if (names.size() < columns_number) {
239  for (size_t i=names.size()+1; i<=columns_number; ++i) {
240  names.push_back("col" + std::to_string(i));
241  }
242  }
243  // Check for duplicate names
244  std::set<std::string> set {};
245  for (auto name : names) {
246  if (!set.insert(name).second) {
247  throw Elements::Exception() << "Duplicate column name " << name;
248  }
249  }
250  return names;
251 }
252 
253 namespace {
254 
255 template <typename T>
256 std::vector<T> convertStringToVector(const std::string& str) {
257  std::vector<T> result {};
258  boost::char_separator<char> sep {","};
259  boost::tokenizer< boost::char_separator<char> > tok {str, sep};
260  for (auto& s : tok) {
261  result.push_back(boost::get<T>(convertToCellType(s, typeid(T))));
262  }
263  return result;
264 }
265 
266 template <typename T>
267 NdArray<T> convertStringToNdArray(const std::string& str) {
268  if (str.empty()) {
269  throw Elements::Exception() << "Cannot convert an empty string to a NdArray";
270  } else if (str[0] != '<') {
271  throw Elements::Exception() << "Unexpected initial character for a NdArray: " << str[0];
272  }
273 
274  auto closing_char = str.find('>');
275  if (closing_char == std::string::npos) {
276  throw Elements::Exception() << "Could not find '>'";
277  }
278 
279  auto shape_str = str.substr(1, closing_char - 1);
280  auto shape_i = convertStringToVector<int32_t>(shape_str);
281  auto data = convertStringToVector<T>(str.substr(closing_char + 1));
282 
283  std::vector<size_t> shape_u;
284  std::copy(shape_i.begin(), shape_i.end(), std::back_inserter(shape_u));
285  return NdArray<T>(shape_u, data);
286 }
287 
288 }
289 
// Body of the string-to-cell conversion (the signature line is not part of
// this listing). Converts `value` to a Row::cell_type holding the C++ type
// identified by `type`:
//   - bool accepts true/t/yes/y/1 and false/f/no/n/0, exactly as written
//   - numeric scalars and std::string go through boost::lexical_cast
//   - std::vector types go through convertStringToVector
//   - NdArray types go through convertStringToNdArray
// A boost::bad_lexical_cast is rethrown as Elements::Exception ("Cannot
// convert ..."); a type matching none of the branches ends in the final
// "Unknown type name" throw.
291  try {
292  if (type == typeid(bool)) {
293  if (value == "true" || value == "t" || value == "yes" || value == "y" || value == "1") {
294  return Row::cell_type {true};
295  }
296  if (value == "false" || value == "f" || value == "no" || value == "n" || value == "0") {
297  return Row::cell_type {false};
298  }
// NOTE(review): a bool value with any other spelling leaves this branch
// without returning and reaches the final throw, which reports "Unknown type
// name" although the type is known - a "Cannot convert" error looks intended.
299  } else if (type == typeid(int32_t)) {
300  return Row::cell_type {boost::lexical_cast<int32_t>(value)};
301  } else if (type == typeid(int64_t)) {
302  return Row::cell_type {boost::lexical_cast<int64_t>(value)};
303  } else if (type == typeid(float)) {
304  return Row::cell_type {boost::lexical_cast<float>(value)};
305  } else if (type == typeid(double)) {
306  return Row::cell_type {boost::lexical_cast<double>(value)};
307  } else if (type == typeid(std::string)) {
308  return Row::cell_type {boost::lexical_cast<std::string>(value)};
309  } else if (type == typeid(std::vector<bool>)) {
310  return Row::cell_type {convertStringToVector<bool>(value)};
311  } else if (type == typeid(std::vector<int32_t>)) {
312  return Row::cell_type {convertStringToVector<int32_t>(value)};
313  } else if (type == typeid(std::vector<int64_t>)) {
314  return Row::cell_type {convertStringToVector<int64_t>(value)};
315  } else if (type == typeid(std::vector<float>)) {
316  return Row::cell_type {convertStringToVector<float>(value)};
317  } else if (type == typeid(std::vector<double>)) {
318  return Row::cell_type {convertStringToVector<double>(value)};
319  } else if (type == typeid(NdArray<bool>)) {
320  return Row::cell_type {convertStringToNdArray<bool>(value)};
321  } else if (type == typeid(NdArray<int32_t>)) {
322  return Row::cell_type {convertStringToNdArray<int32_t>(value)};
323  } else if (type == typeid(NdArray<int64_t>)) {
324  return Row::cell_type {convertStringToNdArray<int64_t>(value)};
325  } else if (type == typeid(NdArray<float>)) {
326  return Row::cell_type {convertStringToNdArray<float>(value)};
327  } else if (type == typeid(NdArray<double>)) {
328  return Row::cell_type {convertStringToNdArray<double>(value)};
329  }
330  } catch( boost::bad_lexical_cast const& ) {
331  throw Elements::Exception() << "Cannot convert " << value << " to " << type.name();
332  }
333  throw Elements::Exception() << "Unknown type name " << type.name();
334 }
335 
336 bool hasNextRow(std::istream& in, const std::string& comment) {
337  StreamRewinder rewinder {in};
338  while(in) {
339  std::string line;
340  getline(in, line);
341  size_t comment_pos = line.find(comment);
342  if (comment_pos != std::string::npos) {
343  line = line.substr(0, comment_pos);
344  }
345  boost::trim(line);
346  if (!line.empty()) {
347  return true;
348  }
349  }
350  return false;
351 }
352 
// Body of the remaining-rows counter (the signature line is not part of this
// listing). Reads the stream to its end and counts the lines that are not
// empty after cutting them at the first occurrence of `comment` and trimming
// whitespace. A StreamRewinder guard is attached to the stream for the
// duration of the call.
354  StreamRewinder rewinder {in};
355  std::size_t count = 0;
356  while(in) {
357  std::string line;
358  getline(in, line);
// Everything from the comment marker to the end of the line is ignored
359  size_t comment_pos = line.find(comment);
360  if (comment_pos != std::string::npos) {
361  line = line.substr(0, comment_pos);
362  }
363  boost::trim(line);
364  if (!line.empty()) {
365  ++count;
366  }
367  }
368  return count;
369 }
370 
371 }
372 } // end of namespace Euclid
std::size_t countRemainingRows(std::istream &in, const std::string &comment)
T empty(T... args)
T copy(T... args)
T forward_as_tuple(T... args)
boost::variant< bool, int32_t, int64_t, float, double, std::string, std::vector< bool >, std::vector< int32_t >, std::vector< int64_t >, std::vector< float >, std::vector< double >, NdArray::NdArray< bool >, NdArray::NdArray< int32_t >, NdArray::NdArray< int64_t >, NdArray::NdArray< float >, NdArray::NdArray< double > > cell_type
The possible cell types.
Definition: Row.h:84
constexpr double second
T to_string(T... args)
static Elements::Logging logger
T end(T... args)
Row::cell_type convertToCellType(const std::string &value, std::type_index type)
Converts the given value to a Row::cell_type of the given type.
STL class.
STL class.
std::type_index keywordToType(const std::string &keyword)
STL class.
T push_back(T... args)
NdArray(const std::vector< size_t > &shape)
Definition: NdArray.h:62
void warn(const std::string &logMessage)
T erase(T... args)
T str(T... args)
T move(T... args)
std::map< std::string, ColumnDescription > autoDetectColumnDescriptions(std::istream &in, const std::string &comment)
Reads the column descriptions of the given stream.
This class gets a stream as argument during construction and when it is deleted it sets the position ...
T count(T... args)
bool hasNextRow(std::istream &in, const std::string &comment)
T find(T... args)
std::string quoted(const std::string &str)
STL class.
STL class.
T name(T... args)
T begin(T... args)
T back_inserter(T... args)
T emplace(T... args)
T substr(T... args)
static Logging getLogger(const std::string &name="")
size_t countColumns(std::istream &in, const std::string &comment)
Returns the number of whitespace separated tokens of the first non commented line.
std::vector< std::string > autoDetectColumnNames(std::istream &in, const std::string &comment, size_t columns_number)
Reads the column names of the given stream.