A very simple line oriented parser that tokenizes the data on each line. More...
#include <simple_file_parser.h>
Public Types | |
| typedef std::vector< std::string > | tokens_t |
| Container for the tokens. | |
Public Member Functions | |
| simple_file_parser () | |
| Default constructor. | |
| ~simple_file_parser () | |
| Destructor. | |
| void | close () |
| End the parsing operation. | |
| bool | eof () const |
| Are we at the end of the file? | |
| const std::string & | get_file_name () const |
| Get the file name. | |
| unsigned | get_line_number () const |
| Get the line number. | |
| bool | get_next_line (size_t num_tokens, const std::string &tok1="", const std::string &tok2="", const std::string &tok3="", const std::string &tok4="", const std::string &tok5="", const std::string &tok6="", const std::string &tok7="", const std::string &tok8="") |
| Get tokens from next line with expected values. | |
| size_t | get_next_line () |
| Get tokens from the next line. | |
| size_t | get_num_tokens () const |
| Get the number of tokens just parsed. | |
| const char * | get_single_char_tokens () const |
| Get the single character tokens. | |
| const std::string & | get_token (size_t i) const |
| Get the i-th token. | |
| unsigned | get_token_int (size_t i) const |
| Get the i-th token as a signed integer. | |
| unsigned | get_token_uint (size_t i) const |
| Get the i-th token as an unsigned integer. | |
| bool | is_token_int (size_t i) const |
| Is the i-th token an integer? | |
| bool | is_token_uint (size_t i) const |
| Is the i-th token an unsigned integer? | |
| bool | open (const std::string &fn) |
| Begin the parsing operation by opening the file and setting up the internal state. | |
| void | rewind () |
| Rewind to the beginning of the file. | |
| void | set_single_char_tokens (const char *tokens) |
| Define the single character tokens. | |
Static Public Member Functions | |
| static bool | file_exists (const std::string &fn) |
| Does the file exist (const string parameter)? | |
| static bool | file_exists (const char *fn) |
| Does the file exist? | |
Private Member Functions | |
| simple_file_parser (const simple_file_parser &obj) | |
| Private copy constructor. | |
| void | add_token (char *token) |
| Add a token. | |
| char * | is_continuation_line (char *line) const |
| Does this line continue? A continuation line has a backslash preceded by whitespace as the last character. | |
| simple_file_parser & | operator= (const simple_file_parser &obj) |
| Private assignment operator. | |
| char * | skip_comments (char *p) |
| Remove comments from the string. | |
| char * | skip_to_end_of_comment (char *p) |
| Skip to the end of a C-style comment. | |
| char * | skip_to_whitespace (char *p) |
| Skip to white space. | |
| char * | skip_whitespace (char *p) |
| Skip white space. | |
Private Attributes | |
| char * | m_buf |
| unsigned | m_bufsz |
| std::string | m_fn |
| std::ifstream * | m_ifs |
| bool | m_in_comment |
| unsigned | m_lineno |
| bool | m_single_char_tokens [256] |
| tokens_t | m_tokens |
| unsigned | m_v |
A very simple line oriented parser that tokenizes the data on each line.
It is convenient for handling very simple data file formats.
It ignores comments and blank lines automatically.
It supports C++-style single-line comments as well as C-style multi-line comments. Nested C-style comments are NOT supported. Each line must be terminated by a semicolon. The semicolon at the end of the line is treated as a token.
Note that the semantics of the tokens are determined by the user.
Here is an example of an input file:
// Sample file
token1 token2 token3;
token4 token5 ;
"this is one token" 'this is "another"'
// multi-line example
tokena tokenb \
tokenc \
tokend
Note that double quotes cannot be embedded in a doubly quoted string and single quotes cannot be embedded in a singly quoted string.
Here is an example of how to use it in your code:
#include "simple_file_parser.h"
#include <iostream>
#include <iomanip>
#include <map>
#include <string>
using namespace std;
#define MYERR cerr << endl << "ERROR:" << __FILE__ << ":" << __LINE__ << ": "
// Parse my data file with this format
// [var] = [integer value] ;
void parse(const string& file_name,map<string,unsigned>& vars)
{
  simple_file_parser sfp;
  if (!sfp.open(file_name)) {
    MYERR << "cannot read file: '" << file_name << "'" << endl;
    exit(1);
  }
  // Enforce the syntax.
  // [var] = [uint val] ;
  // ^     ^ ^          ^
  // |     | |          +--- end of record
  // |     | +-------------- value
  // |     +---------------- equals sign
  // +---------------------- var name
  while (!sfp.eof()) {
    if (!sfp.get_next_line(4,"","=","",";")) {
      MYERR << "syntax error -- expected 4 tokens"
            << " at line " << sfp.get_line_number()
            << " in " << sfp.get_file_name() << endl;
    }
    if (!sfp.is_token_uint(2)) {
      MYERR << "syntax error -- expected an unsigned integer"
            << " but found '" << sfp.get_token(2) << "'"
            << " at line " << sfp.get_line_number()
            << " in " << sfp.get_file_name() << endl;
    }
    string var = sfp.get_token(0);
    unsigned int val = sfp.get_token_uint(2);
    vars[var] = val; // store it in the collection
  }
}
Here is what a general parsing (tokenizing) routine would look like:
#include "simple_file_parser.h"
#include <iostream>
#include <iomanip>
using namespace std;
#define MYERR cerr << endl << "ERROR:" << __FILE__ << ":" << __LINE__ << ": "
void parse(const string& file_name)
{
  simple_file_parser sfp;
  if (!sfp.open(file_name)) {
    MYERR << "cannot read file: '" << file_name << "'" << endl;
    exit(1);
  }
  // Read each line. Comments and blank lines are ignored.
  while (sfp.get_next_line()) {
    // Print out the tokens for each line.
    cout << "Line " << setw(4) << right << sfp.get_line_number() << left << " ";
    for(size_t i=0;i<sfp.get_num_tokens();i++) {
      if (i) {
        // print a blank prefix
        cout << " ";
      }
      cout << "token[" << setw(2) << setfill('0') << right << i << left << setfill(' ') << "] = "
           << "'" << sfp.get_token(i) << "'" << endl;
    }
  }
}
Definition at line 152 of file simple_file_parser.h.
| typedef std::vector<std::string> simple_file_parser::tokens_t |
Container for the tokens.
Definition at line 158 of file simple_file_parser.h.
| simple_file_parser::simple_file_parser | ( | const simple_file_parser & | obj | ) | [private] |
| simple_file_parser::simple_file_parser | ( | ) |
Default constructor.
Definition at line 44 of file simple_file_parser.cc.
References m_buf, m_bufsz, and m_single_char_tokens.
00045 : m_ifs(0), 00046 m_lineno(0), 00047 m_bufsz(65536), 00048 m_in_comment(false) 00049 { 00050 m_buf = new char[m_bufsz]; 00051 size_t sz = sizeof(m_single_char_tokens)/sizeof(bool); 00052 for(size_t i=0;i<sz;++i) 00053 m_single_char_tokens[i] = false; 00054 }
| simple_file_parser::~simple_file_parser | ( | ) |
| void simple_file_parser::add_token | ( | char * | token | ) | [private] |
Add a token.
This is where the processing takes place to determine whether there are embedded single character tokens.
| token | The token to add. It may contain embedded single character tokens. |
Definition at line 220 of file simple_file_parser.cc.
References m_single_char_tokens, and m_tokens.
Referenced by get_next_line().
00221 { 00222 //MYDEBUG("add_token \""<<token<<"\""); 00223 char* cur = token; 00224 char* beg = token; 00225 for(;*cur;++cur) { 00226 size_t i = size_t(*cur); 00227 if (m_single_char_tokens[i]) { 00228 char single_char_token[2] = {*cur,0}; 00229 single_char_token[0] = *cur; 00230 *cur = 0; 00231 //MYDEBUG("found single character token '" << single_char_token << "' in \"" << token << "\""); 00232 //MYDEBUG("preceding token \"" << beg << "\""); 00233 if (*beg) // handle the case of a single character token 00234 m_tokens.push_back(beg); 00235 m_tokens.push_back(single_char_token); 00236 beg = cur+1; 00237 *cur = single_char_token[0]; 00238 } 00239 } 00240 if (!*cur && *beg) { 00241 //MYDEBUG("adding token \""<<beg<<"\""); 00242 m_tokens.push_back(beg); 00243 } 00244 }

| void simple_file_parser::close | ( | ) |
End the parsing operation.
This method can be called multiple times.
Definition at line 89 of file simple_file_parser.cc.
References m_fn, m_ifs, m_lineno, and m_tokens.
Referenced by ~simple_file_parser().
00090 { 00091 if (m_ifs) { 00092 m_lineno = 0; 00093 delete m_ifs; 00094 m_ifs = 0; 00095 m_tokens.clear(); 00096 m_fn = ""; 00097 } 00098 }

| bool simple_file_parser::eof | ( | ) | const |
| static bool simple_file_parser::file_exists | ( | const std::string & | fn | ) | [static] |
Does the file exist (const string parameter)?
| fn | The file name to check (const string&). |
| bool simple_file_parser::file_exists | ( | const char * | fn | ) | [static] |
Does the file exist?
| fn | The file name to check (const char*). |
Definition at line 63 of file simple_file_parser.cc.
| const std::string& simple_file_parser::get_file_name | ( | ) | const [inline] |
Get the file name.
Definition at line 221 of file simple_file_parser.h.
References m_fn.
Referenced by get_next_line().
00221 {return m_fn;}

| unsigned simple_file_parser::get_line_number | ( | ) | const [inline] |
Get the line number.
Definition at line 215 of file simple_file_parser.h.
References m_lineno.
00215 {return m_lineno;}
| bool simple_file_parser::get_next_line | ( | size_t | num_tokens, | |
| const std::string & | tok1 = "", |
|||
| const std::string & | tok2 = "", |
|||
| const std::string & | tok3 = "", |
|||
| const std::string & | tok4 = "", |
|||
| const std::string & | tok5 = "", |
|||
| const std::string & | tok6 = "", |
|||
| const std::string & | tok7 = "", |
|||
| const std::string & | tok8 = "" | |||
| ) |
Get tokens from next line with expected values.
Multi-line records are coalesced into a single record and comments are stripped out. Blank lines are ignored. If the expected values are not found, false is returned. If a token is "", it is not tested.
This function is extremely useful for looking for specific keywords in the data file.
The maximum of eight tokens was completely arbitrary.
| num_tokens | The expected number of tokens. | |
| tok1 | Expected value of the first token. It is blank if there is no expected value. | |
| tok2 | Expected value of the second token. It is blank if there is no expected value. | |
| tok3 | Expected value of the third token. It is blank if there is no expected value. | |
| tok4 | Expected value of the fourth token. It is blank if there is no expected value. | |
| tok5 | Expected value of the fifth token. It is blank if there is no expected value. | |
| tok6 | Expected value of the sixth token. It is blank if there is no expected value. | |
| tok7 | Expected value of the seventh token. It is blank if there is no expected value. | |
| tok8 | Expected value of the eighth token. It is blank if there is no expected value. |
| size_t simple_file_parser::get_next_line | ( | ) |
Get tokens from the next line.
Multi-line records are coalesced into a single record and comments are stripped out. Blank lines are ignored.
Definition at line 246 of file simple_file_parser.cc.
References add_token(), get_file_name(), get_num_tokens(), is_continuation_line(), m_buf, m_bufsz, m_ifs, m_lineno, m_tokens, skip_to_whitespace(), and skip_whitespace().
00247 { 00248 // Parsing happens here. 00249 m_buf[0] = 0; 00250 m_tokens.clear(); 00251 char *p = m_buf; 00252 while (!m_ifs->eof() && m_ifs->getline(m_buf,m_bufsz)) { 00253 m_lineno++; 00254 00255 // Check to see whether there is a backslash with preceding 00256 // whitespace at the end of the line, if so it is a continuation 00257 // line so data from the next line must be concatenated before 00258 // processing the tokens. All of the data is stored in m_buf. 00259 p = m_buf; 00260 char* eol = is_continuation_line(p); 00261 size_t sz = m_ifs->gcount(); 00262 while (eol && !m_ifs->eof()) { 00263 // Note that the starting point and the sz need to be adjusted 00264 // because we ignore the actual backslash character. 00265 --sz; 00266 p = eol-1; // ignore the backslash character 00267 if (sz>=m_bufsz) { 00268 break; 00269 } 00270 m_ifs->getline(p,m_bufsz-sz); 00271 m_lineno++; 00272 sz += m_ifs->gcount(); 00273 eol = is_continuation_line(p); 00274 } 00275 if (sz>=m_bufsz) { 00276 // We have a serious problem here because the buffer is not 00277 // large enough to contain the tokens for this record. There 00278 // are two choices, throw an exception or generate a fatal 00279 // error. For now I will generate a fatal error because these 00280 // should be an incredibly unlikely event. 00281 cerr << endl 00282 << "ERROR:" << __FILE__ << ":" << __LINE__ << ": " 00283 << "Internal buffer overflow, maximum of characters per line is " 00284 << (m_bufsz-1) << ", found " << sz 00285 << " at line " << m_lineno << " in " << get_file_name() 00286 << endl; 00287 exit(1); 00288 } 00289 00290 // Process the line. 00291 // This loop skips w/s only lines. 00292 p = skip_whitespace(m_buf); 00293 if (*p) { 00294 while (*p) { 00295 char* end = p; 00296 if (*end=='"') { 00297 // Special handling for double quoted entities. 00298 // Nested quote characters are not allowed. 00299 // Use the single quote if you want to nest double quotes. 
00300 // Legal examples: 00301 // 1 "A" 00302 // 2 "A B" 00303 // 3 "A B's" 00304 // 4 "A // embedded comment is not a comment" 00305 // 5 "A /* xx */ C" 00306 // 00307 // Illegal examples: 00308 // 1 "A \"B\"" 00309 p++; // skip the initial " 00310 for(end++;*end!=0 && *end!='"' ;++end); 00311 } 00312 else if (*p=='\'') { 00313 // Special handling for single quoted entities. 00314 // Nested quote characters are not allowed. 00315 // Use the single quote if you want to nest double quotes. 00316 // Legal examples: 00317 // 1 'A' 00318 // 2 'A B' 00319 // 3 'A "B"' 00320 // 4 'A // embedded comment is not a comment' 00321 // 5 'A /* xx */ C' 00322 // 00323 // Illegal examples: 00324 // 1 'A \'B\'' 00325 p++; // skip the initial ' 00326 for(end++;*end!=0 && *end!='\'' ;++end); 00327 } 00328 else { 00329 // Any characters but w/s and comments. 00330 end = skip_to_whitespace(p); 00331 } 00332 char ec = *end; 00333 *end = 0; 00334 add_token(p); 00335 if (ec) { 00336 p = skip_whitespace(end+1); 00337 } 00338 else { 00339 *p = 0; 00340 } 00341 } 00342 break; 00343 } 00344 } 00345 return get_num_tokens(); 00346 }

| size_t simple_file_parser::get_num_tokens | ( | ) | const [inline] |
Get the number of tokens just parsed.
Definition at line 286 of file simple_file_parser.h.
References m_tokens.
Referenced by get_next_line().
00286 {return m_tokens.size();}

| const char * simple_file_parser::get_single_char_tokens | ( | ) | const |
Get the single character tokens.
Definition at line 127 of file simple_file_parser.cc.
References m_single_char_tokens.
00128 { 00129 const size_t cache_size = sizeof(m_single_char_tokens)/sizeof(bool); 00130 static char cache[cache_size]; 00131 char* p = cache; 00132 for(size_t i=0;i<cache_size;++i) { 00133 if (m_single_char_tokens[i]) { 00134 char ch = char(i); 00135 *p++ = ch; 00136 } 00137 } 00138 *p = 0; 00139 return cache; 00140 }
| const std::string& simple_file_parser::get_token | ( | size_t | i | ) | const [inline] |
Get the i-th token.
| i | The token index in the range [0..(N-1)] where N == get_num_tokens(). The index is not range checked: the implementation uses std::vector::operator[], so an out-of-range i results in undefined behavior rather than an exception. |
Definition at line 296 of file simple_file_parser.h.
References m_tokens.
00296 {return m_tokens[i];}
| unsigned simple_file_parser::get_token_int | ( | size_t | i | ) | const [inline] |
Get the i-th token as a signed integer.
| i | The token index in the range [0..(N-1)] where N == get_num_tokens(). The index is not range checked: the implementation uses std::vector::operator[], so an out-of-range i results in undefined behavior rather than an exception. |
Definition at line 326 of file simple_file_parser.h.
References m_tokens.
00326 {return atoi(m_tokens[i].c_str());}
| unsigned simple_file_parser::get_token_uint | ( | size_t | i | ) | const [inline] |
Get the i-th token as an unsigned integer.
| i | The token index in the range [0..(N-1)] where N == get_num_tokens(). The index is not range checked: the implementation uses std::vector::operator[], so an out-of-range i results in undefined behavior rather than an exception. |
Definition at line 306 of file simple_file_parser.h.
References m_tokens.
00306 {return atoi(m_tokens[i].c_str());}
| char * simple_file_parser::is_continuation_line | ( | char * | line | ) | const [private] |
Does this line continue? A continuation line has a backslash preceded by whitespace as the last character.
Definition at line 207 of file simple_file_parser.cc.
Referenced by get_next_line().
00208 { 00209 char* p = line; 00210 for(;*p;++p); // skip to the EOL 00211 if ((p-line)<2) { 00212 return 0; // 0 or 1 characters, cannot be a continuation line 00213 } 00214 if (p[-2] <= ' ' && p[-1] == '\\') { 00215 return p; 00216 } 00217 return 0; // not a continuation line 00218 }

| bool simple_file_parser::is_token_int | ( | size_t | i | ) | const |
Is the i-th token an integer?
| i | The token index in the range [0..(N-1)] where N == get_num_tokens(). The index is not range checked: the implementation uses std::vector::operator[], so an out-of-range i results in undefined behavior rather than an exception. |
Definition at line 391 of file simple_file_parser.cc.
References m_tokens.
00392 { 00393 const char* p = m_tokens[i].c_str(); 00394 00395 // leading '-' is okay 00396 if (*p!='-' && (*p<'0' || *p>'9')) 00397 return false; 00398 00399 for(++p;*p;p++) { 00400 if (*p<'0' || *p>'9') 00401 return false; 00402 } 00403 00404 return true; 00405 }
| bool simple_file_parser::is_token_uint | ( | size_t | i | ) | const |
Is the i-th token an unsigned integer?
| i | The token index in the range [0..(N-1)] where N == get_num_tokens(). The index is not range checked: the implementation uses std::vector::operator[], so an out-of-range i results in undefined behavior rather than an exception. |
Definition at line 380 of file simple_file_parser.cc.
References m_tokens.
00381 { 00382 const char* p = m_tokens[i].c_str(); 00383 for(;*p;p++) { 00384 if (*p<'0' || *p>'9') 00385 return false; 00386 } 00387 00388 return true; 00389 }
| bool simple_file_parser::open | ( | const std::string & | fn | ) |
Begin the parsing operation by opening the file and setting up the internal state.
| fn | The file to parse. |
| simple_file_parser & simple_file_parser::operator= | ( | const simple_file_parser & | obj | ) | [private] |
| void simple_file_parser::rewind | ( | ) |
| void simple_file_parser::set_single_char_tokens | ( | const char * | tokens | ) |
Define the single character tokens.
This is used to define single character tokens. Here is an example usage:
// Make ; and . separate tokens. sfp.set_single_char_tokens(";.");
| tokens | A string that contains the list of single character tokens. |
Definition at line 114 of file simple_file_parser.cc.
References m_single_char_tokens.
00115 { 00116 size_t sz = sizeof(m_single_char_tokens)/sizeof(bool); 00117 for(size_t i=0;i<sz;++i) 00118 m_single_char_tokens[i] = false; 00119 if (tokens) { 00120 for(const char* p = tokens;*p;++p) { 00121 size_t i = size_t(*p); 00122 m_single_char_tokens[i] = true; 00123 } 00124 } 00125 }
| char * simple_file_parser::skip_comments | ( | char * | p | ) | [private] |
Remove comments from the string.
| p | Pointer to the token. |
Definition at line 155 of file simple_file_parser.cc.
References m_in_comment, and skip_to_end_of_comment().
Referenced by skip_whitespace().
00156 { 00157 if (!*p) { 00158 return p; 00159 } 00160 00161 if (m_in_comment) { 00162 p = skip_to_end_of_comment(p); 00163 if (m_in_comment) { 00164 // A multi-line comment that did not end on this line. 00165 return p; 00166 } 00167 } 00168 00169 if (*p != '/') { 00170 // This can't possibly be a comment. 00171 return p; 00172 } 00173 00174 if (p[1] == '/') { 00175 // C++ style -- everything to the EOL can be ignored. 00176 for(;*p;++p); 00177 } 00178 else if (p[1] == '*') { 00179 // C style -- /* .. */ everything until the next EOL can be 00180 // ignored. 00181 p++; 00182 p++; 00183 m_in_comment=true; 00184 p = skip_to_end_of_comment(p); 00185 } 00186 return p; 00187 }


| char * simple_file_parser::skip_to_end_of_comment | ( | char * | p | ) | [private] |
Skip to the end of a C-style comment.
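| p | Pointer to the position in the string from which to scan for the end of the comment. The return value points to the character just past the closing */, or to the end of the string if the comment does not end on this line. |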
Definition at line 142 of file simple_file_parser.cc.
References m_in_comment.
Referenced by skip_comments().
00143 { 00144 for(;*p;++p) { 00145 if (*p == '*' && p[1] == '/') { 00146 p++; 00147 p++; // the first char after the '*/' 00148 m_in_comment=false; 00149 break; 00150 } 00151 } 00152 return p; 00153 }

| char * simple_file_parser::skip_to_whitespace | ( | char * | p | ) | [private] |
Skip to white space.
| p | Pointer to the string. |
Definition at line 201 of file simple_file_parser.cc.
Referenced by get_next_line().

| char * simple_file_parser::skip_whitespace | ( | char * | p | ) | [private] |
Skip white space.
| p | Pointer to the string. |
Definition at line 189 of file simple_file_parser.cc.
References skip_comments().
Referenced by get_next_line().
00190 { 00191 if (!*p) 00192 return p; 00193 for(;*p!=0 && (*p<=' ' || *p>=127);++p); 00194 p = skip_comments(p); 00195 if (*p && (*p<=' ' || *p>=127)) { 00196 p = skip_whitespace(p); // handles nested comments 00197 } 00198 return p; 00199 }


char* simple_file_parser::m_buf [private] |
Definition at line 407 of file simple_file_parser.h.
Referenced by get_next_line(), simple_file_parser(), and ~simple_file_parser().
unsigned simple_file_parser::m_bufsz [private] |
Definition at line 408 of file simple_file_parser.h.
Referenced by get_next_line(), simple_file_parser(), and ~simple_file_parser().
std::string simple_file_parser::m_fn [private] |
Definition at line 403 of file simple_file_parser.h.
Referenced by close(), and get_file_name().
std::ifstream* simple_file_parser::m_ifs [private] |
Definition at line 404 of file simple_file_parser.h.
Referenced by close(), eof(), get_next_line(), and rewind().
bool simple_file_parser::m_in_comment [private] |
Definition at line 411 of file simple_file_parser.h.
Referenced by skip_comments(), and skip_to_end_of_comment().
unsigned simple_file_parser::m_lineno [private] |
Definition at line 406 of file simple_file_parser.h.
Referenced by close(), get_line_number(), get_next_line(), and rewind().
bool simple_file_parser::m_single_char_tokens[256] [private] |
Definition at line 410 of file simple_file_parser.h.
Referenced by add_token(), get_single_char_tokens(), set_single_char_tokens(), and simple_file_parser().
tokens_t simple_file_parser::m_tokens [private] |
Definition at line 405 of file simple_file_parser.h.
Referenced by add_token(), close(), get_next_line(), get_num_tokens(), get_token(), get_token_int(), get_token_uint(), is_token_int(), is_token_uint(), and rewind().
unsigned simple_file_parser::m_v [private] |
Definition at line 409 of file simple_file_parser.h.
1.6.3