simple_file_parser.cc

Go to the documentation of this file.
00001 // ================================================================
00002 // $Id: simple_file_parser.cc,v 1.2 2010/08/02 16:59:30 jdl Exp jdl $
00003 //
00004 // Copyright (C) 2010 Joe Linoff
00005 //
00006 // This source code is free software: you can redistribute it and/or modify
00007 // it under the terms of the GNU General Public License as published by
00008 // the Free Software Foundation, either version 3 of the License, or
00009 // (at your option) any later version.
00010 //
00011 // This code is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 //
00016 // You should have received a copy of the GNU General Public License
00017 // along with this program. If not, see http://www.gnu.org/licenses/.
00018 //
00019 // ----------------------------------------------------------------
00020 //
00021 // Simple file parser that carries over some of the semantics
00022 // from the LALR(n) parser. Namely it supports C++ style
00023 // single line comments (//), multi-line C style comments
00024 // and it ignores blank lines.
00025 //
00026 // It was developed to allow us to use a single paradigm
00027 // for parsing the sites, tnames and nodes files.
00028 // ================================================================
00029 #include <iostream>
00030 #include "simple_file_parser.h"
00031 using namespace std;
00032 
// Debug trace helper: writes "DEBUG:<file>:<line>: <msg>" followed by a
// newline (and flush) to cout. msg is pasted unparenthesized into the
// stream expression so that callers can chain values with <<, for
// example MYDEBUG("token \"" << tok << "\"").
#define MYDEBUG(msg) \
  cout << "DEBUG:" << __FILE__ << ":" << __LINE__ << ": " << msg << endl
00034 
00035 simple_file_parser::simple_file_parser(const simple_file_parser& obj)
00036 {
00037 }
00038 
00039 simple_file_parser& simple_file_parser::operator=(const simple_file_parser& obj)
00040 {
00041   return *this;
00042 }
00043 
00044 simple_file_parser::simple_file_parser()
00045   : m_ifs(0),
00046     m_lineno(0),
00047     m_bufsz(65536),
00048     m_in_comment(false)
00049 {
00050   m_buf = new char[m_bufsz];
00051   size_t sz = sizeof(m_single_char_tokens)/sizeof(bool);
00052   for(size_t i=0;i<sz;++i)
00053     m_single_char_tokens[i] = false;
00054 }
00055 
00056 simple_file_parser::~simple_file_parser()
00057 {
00058   close();
00059   delete [] m_buf;
00060   m_bufsz = 0;
00061 }
00062 
00063 bool simple_file_parser::file_exists(const char* fn)
00064 {
00065   ifstream ifs(fn);
00066   if (!ifs || ifs.bad())
00067     return false;
00068   return true;
00069 }
00070 
00071 bool simple_file_parser::file_exists(const string& fn) {
00072   return file_exists(fn.c_str());
00073 }
00074 
00075 bool simple_file_parser::open(const string& fn)
00076 {
00077   if (!file_exists(fn))
00078     return false;
00079   close();
00080   m_ifs = new ifstream(fn.c_str());
00081   if (!*m_ifs || m_ifs->bad()) {
00082     close();
00083     return false;
00084   }
00085   m_fn = fn;
00086   return true;
00087 }
00088 
00089 void simple_file_parser::close()
00090 {
00091   if (m_ifs) {
00092     m_lineno = 0;
00093     delete m_ifs;
00094     m_ifs = 0;
00095     m_tokens.clear();
00096     m_fn = "";
00097   }
00098 }
00099 
00100 bool simple_file_parser::eof() const
00101 {
00102   return m_ifs ? m_ifs->eof() : true;
00103 }
00104 
00105 void simple_file_parser::rewind()
00106 {
00107   if (m_ifs) {
00108     m_ifs->seekg(0);
00109     m_lineno = 0;
00110     m_tokens.clear();
00111   }
00112 }
00113 
00114 void simple_file_parser::set_single_char_tokens(const char* tokens)
00115 {
00116   size_t sz = sizeof(m_single_char_tokens)/sizeof(bool);
00117   for(size_t i=0;i<sz;++i)
00118     m_single_char_tokens[i] = false;
00119   if (tokens) {
00120     for(const char* p = tokens;*p;++p) {
00121       size_t i = size_t(*p);
00122       m_single_char_tokens[i] = true;
00123     }
00124   }
00125 }
00126 
00127 const char* simple_file_parser::get_single_char_tokens() const
00128 {
00129   const size_t cache_size = sizeof(m_single_char_tokens)/sizeof(bool);
00130   static char cache[cache_size];
00131   char* p = cache;
00132   for(size_t i=0;i<cache_size;++i) {
00133     if (m_single_char_tokens[i]) {
00134       char ch = char(i);
00135       *p++ = ch;
00136     }
00137   }
00138   *p = 0;
00139   return cache;
00140 }
00141 
00142 char* simple_file_parser::skip_to_end_of_comment(char* p)
00143 {
00144   for(;*p;++p) {
00145     if (*p == '*' && p[1] == '/') {
00146       p++;
00147       p++; // the first char after the '*/'
00148       m_in_comment=false;
00149       break;
00150     }
00151   }
00152   return p;
00153 }
00154 
00155 char* simple_file_parser::skip_comments(char* p)
00156 {
00157   if (!*p) {
00158     return p;
00159   }
00160   
00161   if (m_in_comment) {
00162     p = skip_to_end_of_comment(p);
00163     if (m_in_comment) {
00164       // A multi-line comment that did not end on this line.
00165       return p;
00166     }
00167   }
00168   
00169   if (*p != '/') {
00170     // This can't possibly be a comment.
00171     return p;
00172   }
00173   
00174   if (p[1] == '/') {
00175     // C++ style -- everything to the EOL can be ignored.
00176     for(;*p;++p);
00177   }
00178   else if (p[1] == '*') {
00179     // C style -- /* .. */ everything until the next EOL can be
00180     //            ignored.
00181     p++;
00182     p++;
00183     m_in_comment=true;
00184     p = skip_to_end_of_comment(p);
00185   }
00186   return p;
00187 }
00188 
00189 char* simple_file_parser::skip_whitespace(char* p)
00190 {
00191   if (!*p)
00192     return p;
00193   for(;*p!=0 && (*p<=' ' || *p>=127);++p);
00194   p = skip_comments(p);
00195   if (*p && (*p<=' ' || *p>=127)) {
00196     p = skip_whitespace(p); // handles nested comments
00197   }
00198   return p;
00199 }
00200 
00201 char* simple_file_parser::skip_to_whitespace(char* p)
00202 {
00203   for(;*p!=0 && *p>' ' && *p<127;++p);
00204   return p;
00205 }
00206 
00207 char* simple_file_parser::is_continuation_line(char* line) const
00208 {
00209   char* p = line;
00210   for(;*p;++p); // skip to the EOL
00211   if ((p-line)<2) {
00212     return 0; // 0 or 1 characters, cannot be a continuation line
00213   }
00214   if (p[-2] <= ' ' && p[-1] == '\\') {
00215     return p;
00216   }
00217   return 0; // not a continuation line
00218 }
00219 
00220 void simple_file_parser::add_token(char* token)
00221 {
00222   //MYDEBUG("add_token \""<<token<<"\"");
00223   char* cur = token;
00224   char* beg = token;
00225   for(;*cur;++cur) {
00226     size_t i = size_t(*cur);
00227     if (m_single_char_tokens[i]) {
00228       char single_char_token[2] = {*cur,0};
00229       single_char_token[0] = *cur;
00230       *cur = 0;
00231       //MYDEBUG("found single character token '" << single_char_token << "' in \"" << token << "\"");
00232       //MYDEBUG("preceding token \"" << beg << "\"");
00233       if (*beg) // handle the case of a single character token
00234         m_tokens.push_back(beg);
00235       m_tokens.push_back(single_char_token);
00236       beg = cur+1;
00237       *cur = single_char_token[0];
00238     }
00239   }
00240   if (!*cur && *beg) {
00241     //MYDEBUG("adding token \""<<beg<<"\"");
00242     m_tokens.push_back(beg);
00243   }
00244 }
00245 
00246 size_t simple_file_parser::get_next_line()
00247 {
00248   // Parsing happens here.
00249   m_buf[0] = 0;
00250   m_tokens.clear();
00251   char *p = m_buf;
00252   while (!m_ifs->eof() && m_ifs->getline(m_buf,m_bufsz)) {
00253     m_lineno++;
00254 
00255     // Check to see whether there is a backslash with preceding
00256     // whitespace at the end of the line, if so it is a continuation
00257     // line so data from the next line must be concatenated before
00258     // processing the tokens. All of the data is stored in m_buf.
00259     p = m_buf;
00260     char* eol = is_continuation_line(p);
00261     size_t sz = m_ifs->gcount();
00262     while (eol && !m_ifs->eof()) {
00263       // Note that the starting point and the sz need to be adjusted
00264       // because we ignore the actual backslash character.
00265       --sz;
00266       p = eol-1; // ignore the backslash character
00267       if (sz>=m_bufsz) {
00268         break;
00269       }
00270       m_ifs->getline(p,m_bufsz-sz);
00271       m_lineno++;
00272       sz += m_ifs->gcount();
00273       eol = is_continuation_line(p);
00274     }
00275     if (sz>=m_bufsz) {
00276       // We have a serious problem here because the buffer is not
00277       // large enough to contain the tokens for this record. There
00278       // are two choices, throw an exception or generate a fatal
00279       // error. For now I will generate a fatal error because these
00280       // should be an incredibly unlikely event.
00281       cerr << endl
00282            << "ERROR:" << __FILE__ << ":" << __LINE__ << ": "
00283            << "Internal buffer overflow, maximum of characters per line is "
00284            << (m_bufsz-1) << ", found " << sz
00285            << " at line " << m_lineno << " in " << get_file_name()
00286            << endl;
00287       exit(1);
00288     }
00289 
00290     // Process the line.
00291     // This loop skips w/s only lines.
00292     p = skip_whitespace(m_buf);
00293     if (*p) {
00294       while (*p) {
00295         char* end = p;
00296         if (*end=='"') {
00297           // Special handling for double quoted entities.
00298           // Nested quote characters are not allowed.
00299           // Use the single quote if you want to nest double quotes.
00300           // Legal examples:
00301           //    1 "A"
00302           //    2 "A B"
00303           //    3 "A B's"
00304           //    4 "A // embedded comment is not a comment"
00305           //    5 "A /* xx */ C"
00306           //
00307           // Illegal examples:
00308           //    1 "A \"B\""
00309           p++; // skip the initial "
00310           for(end++;*end!=0 && *end!='"' ;++end);
00311         }
00312         else if (*p=='\'') {
00313           // Special handling for single quoted entities.
00314           // Nested quote characters are not allowed.
00315           // Use the single quote if you want to nest double quotes.
00316           // Legal examples:
00317           //    1 'A'
00318           //    2 'A B'
00319           //    3 'A "B"'
00320           //    4 'A // embedded comment is not a comment'
00321           //    5 'A /* xx */ C'
00322           //
00323           // Illegal examples:
00324           //    1 'A \'B\''
00325           p++; // skip the initial '
00326           for(end++;*end!=0 && *end!='\'' ;++end);
00327         }
00328         else {
00329           // Any characters but w/s and comments.
00330           end = skip_to_whitespace(p);
00331         }
00332         char ec = *end;
00333         *end = 0;
00334         add_token(p);
00335         if (ec) {
00336           p = skip_whitespace(end+1);
00337         }
00338         else {
00339           *p = 0;
00340         }
00341       }
00342       break;
00343     }
00344   }
00345   return get_num_tokens();
00346 }
00347 
00348 bool simple_file_parser::get_next_line(size_t num_tokens,
00349                                        const string& tok1,
00350                                        const string& tok2,
00351                                        const string& tok3,
00352                                        const string& tok4,
00353                                        const string& tok5,
00354                                        const string& tok6,
00355                                        const string& tok7,
00356                                        const string& tok8)
00357 {
00358   size_t n = get_next_line();
00359   if (num_tokens>0 && num_tokens!=n)
00360     return false;
00361   if (num_tokens>=1 && tok1!="" && tok1!=m_tokens[0])
00362     return false;
00363   if (num_tokens>=2 && tok2!="" && tok2!=m_tokens[1])
00364     return false;
00365   if (num_tokens>=3 && tok3!="" && tok3!=m_tokens[2])
00366     return false;
00367   if (num_tokens>=4 && tok4!="" && tok4!=m_tokens[3])
00368     return false;
00369   if (num_tokens>=5 && tok5!="" && tok5!=m_tokens[4])
00370     return false;
00371   if (num_tokens>=6 && tok6!="" && tok6!=m_tokens[5])
00372     return false;
00373   if (num_tokens>=7 && tok7!="" && tok7!=m_tokens[6])
00374     return false;
00375   if (num_tokens>=8 && tok8!="" && tok8!=m_tokens[7])
00376     return false;
00377   return true;
00378 }
00379 
00380 bool simple_file_parser::is_token_uint(size_t i) const
00381 {
00382   const char* p = m_tokens[i].c_str();
00383   for(;*p;p++) {
00384     if (*p<'0' || *p>'9')
00385       return false;
00386   }
00387 
00388   return true;
00389 }
00390 
00391 bool simple_file_parser::is_token_int(size_t i) const
00392 {
00393   const char* p = m_tokens[i].c_str();
00394 
00395   // leading '-' is okay
00396   if (*p!='-' && (*p<'0' || *p>'9'))
00397     return false;
00398 
00399   for(++p;*p;p++) {
00400     if (*p<'0' || *p>'9')
00401       return false;
00402   }
00403 
00404   return true;
00405 }