simple_file_parser Class Reference

A very simple line oriented parser that tokenizes the data on each line. More...

#include <simple_file_parser.h>

Public Types
typedef std::vector< std::string >	tokens_t
	Container for the tokens.
Public Member Functions
	simple_file_parser ()
	Default constructor.
	~simple_file_parser ()
	Destructor.
void	close ()
	End the parsing operation.
bool	eof () const
	Are we at the end of the file?
const std::string &	get_file_name () const
	Get the file name.
unsigned	get_line_number () const
	Get the line number.
bool	get_next_line (size_t num_tokens, const std::string &tok1="", const std::string &tok2="", const std::string &tok3="", const std::string &tok4="", const std::string &tok5="", const std::string &tok6="", const std::string &tok7="", const std::string &tok8="")
	Get tokens from next line with expected values.
size_t	get_next_line ()
	Get tokens from the next line.
size_t	get_num_tokens () const
	Get the number of tokens just parsed.
const char *	get_single_char_tokens () const
	Get the single character tokens.
const std::string &	get_token (size_t i) const
	Get the i-th token.
unsigned	get_token_int (size_t i) const
	Get the i-th token as a signed integer.
unsigned	get_token_uint (size_t i) const
	Get the i-th token as an unsigned integer.
bool	is_token_int (size_t i) const
	Is the i-th token an integer?
bool	is_token_uint (size_t i) const
	Is the i-th token an unsigned integer?
bool	open (const std::string &fn)
	Begin the parsing operation by opening the file and setting up the internal state.
void	rewind ()
	Rewind to the beginning of the file.
void	set_single_char_tokens (const char *tokens)
	Define the single character tokens.
Static Public Member Functions
static bool	file_exists (const std::string &fn)
	Does the file exist (const string parameter)?
static bool	file_exists (const char *fn)
	Does the file exist?
Private Member Functions
	simple_file_parser (const simple_file_parser &obj)
	Private copy constructor.
void	add_token (char *token)
	Add a token.
char *	is_continuation_line (char *line) const
	Does this line continue? A continuation line has a backslash preceded by whitespace as the last character.
simple_file_parser &	operator= (const simple_file_parser &obj)
	Private assignment operator.
char *	skip_comments (char *p)
	Remove comments from the string.
char *	skip_to_end_of_comment (char *p)
	Skip to the end of a C-style comment.
char *	skip_to_whitespace (char *p)
	Skip to white space.
char *	skip_whitespace (char *p)
	Skip white space.
Private Attributes
char *	m_buf
unsigned	m_bufsz
std::string	m_fn
std::ifstream *	m_ifs
bool	m_in_comment
unsigned	m_lineno
bool	m_single_char_tokens [256]
tokens_t	m_tokens
unsigned	m_v

Detailed Description

A very simple line oriented parser that tokenizes the data on each line.

It is convenient for handling very simple data file formats.

It ignores comments and blank lines automatically.

It supports C++-style single line comments as well as C-style multi-line comments. Nested C-style comments are NOT supported. Each line must be terminated by a semicolon. The semicolon at the end of the line is treated as a token.

Note that the semantics of the tokens are determined by the user.

Here is an example of an input file:

   // Sample file
   token1 token2 token3;
   token4 token5 ;

   "this is one token" 'this is "another"'

   // multi-line example
   tokena tokenb \
      tokenc \
      tokend

Note that double quotes cannot be embedded in a doubly quoted string and single quotes cannot be embedded in a singly quoted string.

Here is an example of how to use it in your code:

 #include "simple_file_parser.h"
 #include <iostream>
 #include <iomanip>
 #include <map>
 #include <string>
 using namespace std;

 #define MYERR cerr << endl << "ERROR:" << __FILE__ << ":" << __LINE__ << ": "

 // Parse my data file with this format
 // [var] = [integer value] ;
 void parse(const string& file_name,map<string,unsigned>& vars)
 {
    simple_file_parser sfp;
    if (!sfp.open(file_name)) {
       MYERR << "cannot read file: '" << file_name << "'" << endl;
       exit(1);
    }

    // Enforce the syntax.
    //  [var] = [uint val] ;
    //  ^     ^ ^          ^
    //  |     | |          +--- end of record
    //  |     | +-------------- value
    //  |     +---------------- equals sign
    //  +---------------------- var name
    while (!sfp.eof()) {
       if (!sfp.get_next_line(4,"","=","",";")) {
          MYERR << "syntax error -- expected 4 tokens"
                << " at line " << sfp.get_line_number()
                << " in " << sfp.get_file_name()
                << endl;
       }
       if (!sfp.is_token_uint(2)) {
          MYERR << "syntax error -- expected an unsigned integer"
                << " but found '" << sfp.get_token(2) << "'"
                << " at line " << sfp.get_line_number()
                << " in " << sfp.get_file_name()
                << endl;
       }
       string var = sfp.get_token(0);
       unsigned int val = sfp.get_token_uint(2);
       vars[var] = val; // store it in the collection
    }
 }

Here is what a general parsing (tokenizing) routine would look like:

 #include "simple_file_parser.h"
 #include <iostream>
 #include <iomanip>
 using namespace std;

 #define MYERR cerr << endl << "ERROR:" << __FILE__ << ":" << __LINE__ << ": "

 void parse(const string& file_name)
 {
    simple_file_parser sfp;
    if (!sfp.open(file_name)) {
       MYERR << "cannot read file: '" << file_name << "'" << endl;
       exit(1);
    }

    // Read each line. Comments and blank lines are ignored.
    while (sfp.get_next_line()) {
       // Print out the tokens for each line.
       cout << "Line " << setw(4) << right << sfp.get_line_number() << left << " ";
       for(size_t i=0;i<sfp.get_num_tokens();i++) {
          if (i) { // print a blank prefix
             cout << "          ";
          }
          cout << "token[" 
               << setw(2) << setfill('0') << right << i
               << left << setfill(' ') << "] = " 
               << "'" << sfp.get_token(i) << "'"
               << endl;
       }
    }
 }

Definition at line 152 of file simple_file_parser.h.

Member Typedef Documentation

typedef std::vector<std::string> simple_file_parser::tokens_t

Container for the tokens.

Definition at line 158 of file simple_file_parser.h.

Constructor & Destructor Documentation

simple_file_parser::simple_file_parser ( const simple_file_parser & obj ) [private]

Private copy constructor.

Not copyable.

Definition at line 35 of file simple_file_parser.cc.

00036 {
00037 }

simple_file_parser::simple_file_parser ( )

Default constructor.

Definition at line 44 of file simple_file_parser.cc.

References m_buf, m_bufsz, and m_single_char_tokens.

00045   : m_ifs(0),
00046     m_lineno(0),
00047     m_bufsz(65536),
00048     m_in_comment(false)
00049 {
00050   m_buf = new char[m_bufsz];
00051   size_t sz = sizeof(m_single_char_tokens)/sizeof(bool);
00052   for(size_t i=0;i<sz;++i)
00053     m_single_char_tokens[i] = false;
00054 }

simple_file_parser::~simple_file_parser ( )

Destructor.

Definition at line 56 of file simple_file_parser.cc.

References close(), m_buf, and m_bufsz.

00057 {
00058   close();
00059   delete [] m_buf;
00060   m_bufsz = 0;
00061 }

Here is the call graph for this function:

Member Function Documentation

void simple_file_parser::add_token ( char * token ) [private]

Add a token.

This is where the processing takes place to determine whether there are embedded single character tokens.

Parameters:

token

The token to add. It may contain embedded single character tokens.

Definition at line 220 of file simple_file_parser.cc.

References m_single_char_tokens, and m_tokens.

Referenced by get_next_line().

00221 {
00222   //MYDEBUG("add_token \""<<token<<"\"");
00223   char* cur = token;
00224   char* beg = token;
00225   for(;*cur;++cur) {
00226     size_t i = size_t(*cur);
00227     if (m_single_char_tokens[i]) {
00228       char single_char_token[2] = {*cur,0};
00229       single_char_token[0] = *cur;
00230       *cur = 0;
00231       //MYDEBUG("found single character token '" << single_char_token << "' in \"" << token << "\"");
00232       //MYDEBUG("preceding token \"" << beg << "\"");
00233       if (*beg) // handle the case of a single character token
00234         m_tokens.push_back(beg);
00235       m_tokens.push_back(single_char_token);
00236       beg = cur+1;
00237       *cur = single_char_token[0];
00238     }
00239   }
00240   if (!*cur && *beg) {
00241     //MYDEBUG("adding token \""<<beg<<"\"");
00242     m_tokens.push_back(beg);
00243   }
00244 }

Here is the caller graph for this function:

void simple_file_parser::close ( )

End the parsing operation.

This method can be called multiple times.

Definition at line 89 of file simple_file_parser.cc.

References m_fn, m_ifs, m_lineno, and m_tokens.

Referenced by ~simple_file_parser().

00090 {
00091   if (m_ifs) {
00092     m_lineno = 0;
00093     delete m_ifs;
00094     m_ifs = 0;
00095     m_tokens.clear();
00096     m_fn = "";
00097   }
00098 }

Here is the caller graph for this function:

bool simple_file_parser::eof ( ) const

Are we at the end of the file?

Returns:: True if we are at the end of the file or if no file is open, false otherwise.

Definition at line 100 of file simple_file_parser.cc.

References m_ifs.

00101 {
00102   return m_ifs ? m_ifs->eof() : true;
00103 }

static bool simple_file_parser::file_exists ( const std::string & fn ) [static]

Does the file exist (const string parameter)?

Parameters:

fn

The file name to check (const string&).

Returns:: True if it does or false otherwise.

bool simple_file_parser::file_exists ( const char * fn ) [static]

Does the file exist?

Parameters:

fn

The file name to check (const char*).

Returns:: True if it does or false otherwise.

Definition at line 63 of file simple_file_parser.cc.

00064 {
00065   ifstream ifs(fn);
00066   if (!ifs || ifs.bad())
00067     return false;
00068   return true;
00069 }

const std::string& simple_file_parser::get_file_name ( ) const [inline]

Get the file name.

Returns:: the file name.

Definition at line 221 of file simple_file_parser.h.

References m_fn.

Referenced by get_next_line().

00221 {return m_fn;}

Here is the caller graph for this function:

unsigned simple_file_parser::get_line_number ( ) const [inline]

Get the line number.

Returns:: the current line number.

Definition at line 215 of file simple_file_parser.h.

References m_lineno.

00215 {return m_lineno;}

bool simple_file_parser::get_next_line	(	size_t	num_tokens,
		const std::string &	tok1 = `""`,
		const std::string &	tok2 = `""`,
		const std::string &	tok3 = `""`,
		const std::string &	tok4 = `""`,
		const std::string &	tok5 = `""`,
		const std::string &	tok6 = `""`,
		const std::string &	tok7 = `""`,
		const std::string &	tok8 = `""`
	)

Get tokens from next line with expected values.

Multi-line records are coalesced into a single record and comments are stripped out. Blank lines are ignored. If the expected values are not found, false is returned. If a token is "", it is not tested.

This function is extremely useful for looking for specific keywords in the data file.

The maximum of eight tokens was completely arbitrary.

Parameters:

	num_tokens	The expected number of tokens.
	tok1	Expected value of the first token. It is blank if there is no expected value.
	tok2	Expected value of the second token. It is blank if there is no expected value.
	tok3	Expected value of the third token. It is blank if there is no expected value.
	tok4	Expected value of the fourth token. It is blank if there is no expected value.
	tok5	Expected value of the fifth token. It is blank if there is no expected value.
	tok6	Expected value of the sixth token. It is blank if there is no expected value.
	tok7	Expected value of the seventh token. It is blank if there is no expected value.
	tok8	Expected value of the eighth token. It is blank if there is no expected value.

Returns:: True if all of the conditions passed.

size_t simple_file_parser::get_next_line ( )

Get tokens from the next line.

Multi-line records are coalesced into a single record and comments are stripped out. Blank lines are ignored.

Returns:: The number of tokens.

Definition at line 246 of file simple_file_parser.cc.

References add_token(), get_file_name(), get_num_tokens(), is_continuation_line(), m_buf, m_bufsz, m_ifs, m_lineno, m_tokens, skip_to_whitespace(), and skip_whitespace().

00247 {
00248   // Parsing happens here.
00249   m_buf[0] = 0;
00250   m_tokens.clear();
00251   char *p = m_buf;
00252   while (!m_ifs->eof() && m_ifs->getline(m_buf,m_bufsz)) {
00253     m_lineno++;
00254 
00255     // Check to see whether there is a backslash with preceding
00256     // whitespace at the end of the line, if so it is a continuation
00257     // line so data from the next line must be concatenated before
00258     // processing the tokens. All of the data is stored in m_buf.
00259     p = m_buf;
00260     char* eol = is_continuation_line(p);
00261     size_t sz = m_ifs->gcount();
00262     while (eol && !m_ifs->eof()) {
00263       // Note that the starting point and the sz need to be adjusted
00264       // because we ignore the actual backslash character.
00265       --sz;
00266       p = eol-1; // ignore the backslash character
00267       if (sz>=m_bufsz) {
00268         break;
00269       }
00270       m_ifs->getline(p,m_bufsz-sz);
00271       m_lineno++;
00272       sz += m_ifs->gcount();
00273       eol = is_continuation_line(p);
00274     }
00275     if (sz>=m_bufsz) {
00276       // We have a serious problem here because the buffer is not
00277       // large enough to contain the tokens for this record. There
00278       // are two choices, throw an exception or generate a fatal
00279       // error. For now I will generate a fatal error because these
00280       // should be an incredibly unlikely event.
00281       cerr << endl
00282            << "ERROR:" << __FILE__ << ":" << __LINE__ << ": "
00283            << "Internal buffer overflow, maximum of characters per line is "
00284            << (m_bufsz-1) << ", found " << sz
00285            << " at line " << m_lineno << " in " << get_file_name()
00286            << endl;
00287       exit(1);
00288     }
00289 
00290     // Process the line.
00291     // This loop skips w/s only lines.
00292     p = skip_whitespace(m_buf);
00293     if (*p) {
00294       while (*p) {
00295         char* end = p;
00296         if (*end=='"') {
00297           // Special handling for double quoted entities.
00298           // Nested quote characters are not allowed.
00299           // Use the single quote if you want to nest double quotes.
00300           // Legal examples:
00301           //    1 "A"
00302           //    2 "A B"
00303           //    3 "A B's"
00304           //    4 "A // embedded comment is not a comment"
00305           //    5 "A /* xx */ C"
00306           //
00307           // Illegal examples:
00308           //    1 "A \"B\""
00309           p++; // skip the initial "
00310           for(end++;*end!=0 && *end!='"' ;++end);
00311         }
00312         else if (*p=='\'') {
00313           // Special handling for single quoted entities.
00314           // Nested quote characters are not allowed.
00315           // Use the single quote if you want to nest double quotes.
00316           // Legal examples:
00317           //    1 'A'
00318           //    2 'A B'
00319           //    3 'A "B"'
00320           //    4 'A // embedded comment is not a comment'
00321           //    5 'A /* xx */ C'
00322           //
00323           // Illegal examples:
00324           //    1 'A \'B\''
00325           p++; // skip the initial '
00326           for(end++;*end!=0 && *end!='\'' ;++end);
00327         }
00328         else {
00329           // Any characters but w/s and comments.
00330           end = skip_to_whitespace(p);
00331         }
00332         char ec = *end;
00333         *end = 0;
00334         add_token(p);
00335         if (ec) {
00336           p = skip_whitespace(end+1);
00337         }
00338         else {
00339           *p = 0;
00340         }
00341       }
00342       break;
00343     }
00344   }
00345   return get_num_tokens();
00346 }

Here is the call graph for this function:

size_t simple_file_parser::get_num_tokens ( ) const [inline]

Get the number of tokens just parsed.

Returns:: The number of parsed tokens.

Definition at line 286 of file simple_file_parser.h.

References m_tokens.

Referenced by get_next_line().

00286 {return m_tokens.size();}

Here is the caller graph for this function:

const char * simple_file_parser::get_single_char_tokens ( ) const

Get the single character tokens.

Returns:: the single character tokens in a string. If there are no tokens, an empty string is returned.

Definition at line 127 of file simple_file_parser.cc.

References m_single_char_tokens.

00128 {
00129   const size_t cache_size = sizeof(m_single_char_tokens)/sizeof(bool);
00130   static char cache[cache_size];
00131   char* p = cache;
00132   for(size_t i=0;i<cache_size;++i) {
00133     if (m_single_char_tokens[i]) {
00134       char ch = char(i);
00135       *p++ = ch;
00136     }
00137   }
00138   *p = 0;
00139   return cache;
00140 }

const std::string& simple_file_parser::get_token ( size_t i ) const [inline]

Get the i-th token.

Parameters:

i

The token index in the range [0..(N-1)] where N == get_num_tokens(). The index is not bounds-checked: the underlying std::vector is accessed with operator[], so an out-of-range i results in undefined behavior rather than an exception.

Returns:: The i-th token.

Definition at line 296 of file simple_file_parser.h.

References m_tokens.

00296 {return m_tokens[i];}

unsigned simple_file_parser::get_token_int ( size_t i ) const [inline]

Get the i-th token as a signed integer.

Parameters:

i

The token index in the range [0..(N-1)] where N == get_num_tokens(). The index is not bounds-checked: the underlying std::vector is accessed with operator[], so an out-of-range i results in undefined behavior rather than an exception.

Returns:: The i-th token as a signed integer.

Definition at line 326 of file simple_file_parser.h.

References m_tokens.

00326 {return atoi(m_tokens[i].c_str());}

unsigned simple_file_parser::get_token_uint ( size_t i ) const [inline]

Get the i-th token as an unsigned integer.

Parameters:

i

The token index in the range [0..(N-1)] where N == get_num_tokens(). The index is not bounds-checked: the underlying std::vector is accessed with operator[], so an out-of-range i results in undefined behavior rather than an exception.

Returns:: The i-th token as an unsigned integer.

Definition at line 306 of file simple_file_parser.h.

References m_tokens.

00306 {return atoi(m_tokens[i].c_str());}

char * simple_file_parser::is_continuation_line ( char * line ) const [private]

Does this line continue? A continuation line has a backslash preceded by whitespace as the last character.

Returns:: The end of the line if it is a continuation line or NULL if it is not.

Definition at line 207 of file simple_file_parser.cc.

Referenced by get_next_line().

00208 {
00209   char* p = line;
00210   for(;*p;++p); // skip to the EOL
00211   if ((p-line)<2) {
00212     return 0; // 0 or 1 characters, cannot be a continuation line
00213   }
00214   if (p[-2] <= ' ' && p[-1] == '\\') {
00215     return p;
00216   }
00217   return 0; // not a continuation line
00218 }

Here is the caller graph for this function:

bool simple_file_parser::is_token_int ( size_t i ) const

Is the i-th token an integer?

Parameters:

i

The token index in the range [0..(N-1)] where N == get_num_tokens(). The index is not bounds-checked: the underlying std::vector is accessed with operator[], so an out-of-range i results in undefined behavior rather than an exception.

Returns:: true if it is or false otherwise.

Definition at line 391 of file simple_file_parser.cc.

References m_tokens.

00392 {
00393   const char* p = m_tokens[i].c_str();
00394 
00395   // leading '-' is okay
00396   if (*p!='-' && (*p<'0' || *p>'9'))
00397     return false;
00398 
00399   for(++p;*p;p++) {
00400     if (*p<'0' || *p>'9')
00401       return false;
00402   }
00403 
00404   return true;
00405 }

bool simple_file_parser::is_token_uint ( size_t i ) const

Is the i-th token an unsigned integer?

Parameters:

i

The token index in the range [0..(N-1)] where N == get_num_tokens(). The index is not bounds-checked: the underlying std::vector is accessed with operator[], so an out-of-range i results in undefined behavior rather than an exception.

Returns:: true if it is or false otherwise.

Definition at line 380 of file simple_file_parser.cc.

References m_tokens.

00381 {
00382   const char* p = m_tokens[i].c_str();
00383   for(;*p;p++) {
00384     if (*p<'0' || *p>'9')
00385       return false;
00386   }
00387 
00388   return true;
00389 }

bool simple_file_parser::open ( const std::string & fn )

Begin the parsing operation by opening the file and setting up the internal state.

Parameters:

fn

The file to parse.

Returns:: true if the file was opened and is ready to parse or false otherwise.

simple_file_parser & simple_file_parser::operator= ( const simple_file_parser & obj ) [private]

Private assignment operator.

Not assignable.

Definition at line 39 of file simple_file_parser.cc.

00040 {
00041   return *this;
00042 }

void simple_file_parser::rewind ( )

Rewind to the beginning of the file.

Definition at line 105 of file simple_file_parser.cc.

References m_ifs, m_lineno, and m_tokens.

00106 {
00107   if (m_ifs) {
00108     m_ifs->seekg(0);
00109     m_lineno = 0;
00110     m_tokens.clear();
00111   }
00112 }

void simple_file_parser::set_single_char_tokens ( const char * tokens )

Define the single character tokens.

This is used to define single character tokens. Here is an example usage:

    // Make ; and . separate tokens.
    sfp.set_single_char_tokens(";.");

Parameters:

tokens

A string that contains the list of single character tokens.

Definition at line 114 of file simple_file_parser.cc.

References m_single_char_tokens.

00115 {
00116   size_t sz = sizeof(m_single_char_tokens)/sizeof(bool);
00117   for(size_t i=0;i<sz;++i)
00118     m_single_char_tokens[i] = false;
00119   if (tokens) {
00120     for(const char* p = tokens;*p;++p) {
00121       size_t i = size_t(*p);
00122       m_single_char_tokens[i] = true;
00123     }
00124   }
00125 }

char * simple_file_parser::skip_comments ( char * p ) [private]

Remove comments from the string.

Parameters:

p

Pointer to the token.

Returns:: the updated pointer.

Definition at line 155 of file simple_file_parser.cc.

References m_in_comment, and skip_to_end_of_comment().

Referenced by skip_whitespace().

00156 {
00157   if (!*p) {
00158     return p;
00159   }
00160   
00161   if (m_in_comment) {
00162     p = skip_to_end_of_comment(p);
00163     if (m_in_comment) {
00164       // A multi-line comment that did not end on this line.
00165       return p;
00166     }
00167   }
00168   
00169   if (*p != '/') {
00170     // This can't possibly be a comment.
00171     return p;
00172   }
00173   
00174   if (p[1] == '/') {
00175     // C++ style -- everything to the EOL can be ignored.
00176     for(;*p;++p);
00177   }
00178   else if (p[1] == '*') {
00179     // C style -- /* .. */ everything until the next EOL can be
00180     //            ignored.
00181     p++;
00182     p++;
00183     m_in_comment=true;
00184     p = skip_to_end_of_comment(p);
00185   }
00186   return p;
00187 }

Here is the call graph for this function:

Here is the caller graph for this function:

char * simple_file_parser::skip_to_end_of_comment ( char * p ) [private]

Skip to the end of a C-style comment.

Parameters:

p

Pointer to the character just past the comment.

Returns:: the updated pointer.

Definition at line 142 of file simple_file_parser.cc.

References m_in_comment.

Referenced by skip_comments().

00143 {
00144   for(;*p;++p) {
00145     if (*p == '*' && p[1] == '/') {
00146       p++;
00147       p++; // the first char after the '*/'
00148       m_in_comment=false;
00149       break;
00150     }
00151   }
00152   return p;
00153 }

Here is the caller graph for this function:

char * simple_file_parser::skip_to_whitespace ( char * p ) [private]

Skip to white space.

Parameters:

p

Pointer to the string.

Returns:: The pointer to the next white space.

Definition at line 201 of file simple_file_parser.cc.

Referenced by get_next_line().

00202 {
00203   for(;*p!=0 && *p>' ' && *p<127;++p);
00204   return p;
00205 }

Here is the caller graph for this function:

char * simple_file_parser::skip_whitespace ( char * p ) [private]

Skip white space.

Parameters:

p

Pointer to the string.

Returns:: The pointer to the next token.

Definition at line 189 of file simple_file_parser.cc.

References skip_comments().

Referenced by get_next_line().

00190 {
00191   if (!*p)
00192     return p;
00193   for(;*p!=0 && (*p<=' ' || *p>=127);++p);
00194   p = skip_comments(p);
00195   if (*p && (*p<=' ' || *p>=127)) {
00196     p = skip_whitespace(p); // handles nested comments
00197   }
00198   return p;
00199 }