From: <sg...@us...> - 2003-11-17 20:30:15
|
Update of /cvsroot/libfunutil/libfunutil/lib/s11n/parsers In directory sc8-pr-cvs1:/tmp/cvs-serv29404/lib/s11n/parsers Added Files: ns.compact.flex.at ns.flex_lexers.cpp ns.flex_lexers.h ns.funtxt.flex.at ns.funxml.flex.at ns.hex.flex.at ns.paren.flex.at ns.select_lexer.flex.at ns.simplexml.flex.at Log Message: egg --- NEW FILE: ns.compact.flex.at --- %option c++ %{ // // LICENSE: Public Domain // Author: stephan - st...@s1... // #define YY_SKIP_YYWRAP 1 int yywrap() { return 1; } // #include <stdio.h> #include <cassert> #include <iostream> #include <string> #include <deque> #include <S11N_NAMESPACE/s11n-macros.h> // COUT/CERR #define PCERR CERR << "compact.flex error:" // #include <S11N_NAMESPACE/string_util.h> // trim_string() // #include <S11N_NAMESPACE/property_store.h> // #include <S11N_NAMESPACE/class_loader.h> // #include <S11N_NAMESPACE/instantiator.h> // #include <S11N_NAMESPACE/key_value_parser.h> #include <S11N_NAMESPACE/node_builder.h> #include <S11N_NAMESPACE/flex_lexers.h> #include <S11N_NAMESPACE/string_util.h> // hex2int() using std::cin; using std::cerr; using std::cout; using std::endl; /** Basic grammar spec: {NODE_OPEN}{NAME_SIZE}{NODE_NAME}<class_name_size>{CLASSNAME} ({PROP_OPEN}<key_size><key><value_size><value>)* (sub-nodes)* {NODE_CLOSE} See the lex source for the meanings of the {TOKENS} named above. 
*/ namespace { unsigned long node_depth = 0; unsigned int loops = 0; std::string word; std::string propname; std::string propval; std::string nodename; std::string nodeclass; bool in_prop; unsigned int decval = 0; unsigned int lcv = 0; S11N_NAMESPACE::node_builder * serbuilder = 0; } namespace S11N_NAMESPACE { FlexLexer * CompactTreeBuilder::lexer() { FlexLexer * fp = 0; fp = this->FlexTreeBuilder::lexer(); if( fp ) return fp; // else first-time setup: this->reset(); return this->FlexTreeBuilder::lexer(); } void CompactTreeBuilder::reset() { this->FlexTreeBuilder::reset(); FlexLexer * foo = new compactFlexLexer(); this->FlexTreeBuilder::lexer( foo ); serbuilder = this->builder(); node_depth = 0; } } // namespace S11N_NAMESPACE namespace { char inchar; } #define READWORD(SZ) word = ""; \ for( int i = 0; i < SZ; i++ )\ {\ inchar = yyinput(); \ if( 0 == inchar ) {word=""; PCERR << "Reached EOF during READWORD!" << endl; return 0;} \ word += inchar; \ };\ decval = S11N_NAMESPACE::hex2int(word) // if( 0 == decval ) { PCERR << "Error reading word of size " << SZ<<". Maybe reached end of input?" << endl; return 0; } %} HEX_DIGIT ([a-fA-F0-9]) WORD4 ({HEX_DIGIT}{4}) // maintenance note: these hex codes must be kept in sync with those from HexSerializer's enum NODE_OPEN f1 NODE_CLOSE f0 PROP_OPEN e1 COOKIE 51191001 DATA_END 51191000 %% {COOKIE} {;} {DATA_END} { return 0; } [ \t\n] {;} {NODE_OPEN} { //COUT << "Opening node." << std::endl; READWORD(2); // read node name size nodename = ""; loops = decval; for( lcv = 0; lcv < loops; lcv++ ) { //READWORD(2); //cout << "["<<word<<"/"<<decval<<"]"; nodename += yyinput(); // (unsigned char) decval; } //cout<< endl; READWORD(2); // get class name size nodeclass = ""; loops = decval; for( lcv = 0; lcv < loops; lcv++ ) { // read class name nodeclass += (unsigned char) yyinput(); // decval; } //COUT << "nodename=["<<nodename<<"]"<<"["<<nodeclass<<"]"<<endl; if( ! 
serbuilder->open_node( nodeclass, nodename ) ) { PCERR<< "open_node("<<nodeclass<<","<<nodename<<") failed." << endl; return 0; } nodename = nodeclass = ""; } {NODE_CLOSE} { //COUT << "Closing node." << std::endl; serbuilder->close_node(); if( 0 == serbuilder->node_depth() ) { // stop once we close the first top-level node. return 0; } continue; } {PROP_OPEN} { //COUTL( "Opening property" ); propname = ""; READWORD(2); // prop name size loops = decval; for( lcv = 0; lcv < loops; lcv++ ) { // read property name propname += (unsigned char) yyinput(); // decval; } READWORD(8); // get value size propval = ""; loops = decval; for( lcv = 0; lcv < loops; lcv++ ) { // read property's value propval += (unsigned char) yyinput(); // decval; } serbuilder->add_property( propname, propval ); propval = propname = ""; } [.] { PCERR << "unexpected token: " << YYText() <<std::endl; return 0; } %% #if COMPACT_DO_MAIN #include <S11N_NAMESPACE/s11n_io.h> // HexSerializer // #include <S11N_NAMESPACE/FlexShell.h> // #include <S11N_NAMESPACE/ELib.h> using namespace S11N_NAMESPACE; int main( int argc, char ** argv ) { S11N_NAMESPACE::CompactTreeBuilder bob; FlexLexer * lexer = bob.lexer(); // FlexLexer * lexer = new compactFlexLexer(); while( 0 != (lexer->yylex() ) ); if( bob.root_node() ) { S11N_NAMESPACE::ParenSerializer ser; ser.serialize( *(bob.root_node()), std::cout ); // S11N_NAMESPACE::CompactSerializer compact; // compact.serialize( *(bob.root_node()), std::cout ); } return 0; } #endif --- NEW FILE: ns.flex_lexers.cpp --- // Author: stephan beal <st...@s1...> // License: Public Domain #include "flex_lexers.h" #include <cassert> #include <S11N_NAMESPACE/s11n-macros.h> // COUT namespace S11N_NAMESPACE { FlexTreeBuilder::FlexTreeBuilder():m_lexer( 0 ), m_lexer_inited( false ), m_builder( 0 ) { } FlexTreeBuilder::~FlexTreeBuilder() { this->cleanup(); } FlexLexer *FlexTreeBuilder::lexer() { return this->m_lexer; } void FlexTreeBuilder::lexer( FlexLexer * newlexer ) { delete( 
this->m_lexer ); this->m_lexer = newlexer; } s11n_node * FlexTreeBuilder::root_node() { return this->m_builder ? this->m_builder->root_node() : NULL; } int FlexTreeBuilder::lexer_loop() { FlexLexer *l = this->lexer(); int ret; while ( 0 != ( ret = l->yylex() ) ) { } return ret; } void FlexTreeBuilder::reset() { delete( m_builder ); m_builder = new node_builder(); m_lexer_inited = true; } void FlexTreeBuilder::cleanup() { delete( m_builder ); m_builder = 0; m_lexer_inited = false; delete( this->m_lexer ); this->m_lexer = 0; } node_builder * FlexTreeBuilder::builder() { if ( !m_lexer_inited ) this->reset(); return m_builder; } bool FlexTreeBuilder::load( std::istream & in ) { if ( ! in.good() ) return false; FlexLexer *fl = this->lexer(); fl->switch_streams( &in ); int ret; ret = this->lexer_loop(); if ( 0 != ret ) { this->reset(); } return ( 0 == ret ); } } // namespace S11N_NAMESPACE --- NEW FILE: ns.flex_lexers.h --- #ifndef S11N_NAMESPACE_FLEX_LEXERS_H_INCLUDED #define S11N_NAMESPACE_FLEX_LEXERS_H_INCLUDED 1 // Author: stephan beal <st...@s1...> // License: Public Domain #include <string> #include <iostream> #ifndef FLEX_SCANNER #include <FlexLexer.h> #endif // #include <S11N_NAMESPACE/S11n.h> #include <S11N_NAMESPACE/node_builder.h> #define LEXER_LOUD 1 #if LEXER_LOUD # include <S11N_NAMESPACE/s11n-macros.h> // COUT/CERR #endif #define LOUT if(LEXER_LOUD) CERR #define lout if(LEXER_LOUD) cerr namespace S11N_NAMESPACE { /** FlexTreeBuilder declares an EXPERIMENTAL quasi-interface into the libfun/s11n-related flex parsers. It is meant to be included by the parsers and their clients. The ground rules are described in the FlexTreeBuilder struct, but the same notes apply to all the lexers which subclass it. Why all this hassle? Because getting multiple flex-based parsers into one library is more painful than it really ought to be, and the FlexLexer subclasses must be well-hidden from clients to avoid potential compilation problems. 
Additionally, loading data without knowing its data type in advance is more trouble than it should be, largely due to the hassles involved with linking in multiple FlexLexer subclasses. Much of this code is implemented in flex_lexers.cpp, but some of it must be implemented by the specific lexers and lives elsewhere. Big Fat Warning: This interface is COMPLETELY not thread-safe! Much of the internal data used by the lexers is global-scope and multiple lexers, even of different types, should NOT be used in parallel! This can be considered a bug, and this limitation may be addressed (i.e., put off again) at some point. Usage notes: - You must never delete any lexer you fetch via lexer(): it is owned by the base class. - You should call FlexTreeBuilder::cleanup() when you're done so the tree builder can be deleted. Deleting a builder will clean it up, obviously. If you need to hold on to its output you can call builder()->auto_delete(false) to keep it from deleting the parsed-in node. - Subclasses should assume that each input block contains exactly one root node, and stop parsing after processing one node. (This is purely a matter of long-standing convention.) Developer notes: - see the samples in fun{txt,xml}.flex.at, et al. todos: - investigate making this a subclass of node_builder. - the current subclasses duplicate way too much code. Find out how best to collapse that. */ class FlexTreeBuilder { // implementation lives in flex_lexers.cpp public: virtual ~ FlexTreeBuilder(); /** FlexLexer * lexer() Returns the FlexLexer parser set via lexer( FlexLexer * ), which subclasses should call to set the proper lexer. The default behaviour is to return NULL. This object is absolutely useless until the lexer() is set. The caller DOES NOT own the returned pointer. lexer implementors: you may want to do any one-time initialization of your lexer in the first call to lexer(). 
This is the only function which provides public access to your lexer, so doing any initialization here will ensure that it gets called before the lexer is actually used. Typically a call to reset() should be done here, as that creates a new internal tree builder (see builder(), below). By convention (the one set by FunXML and FunTxt, that is), a lexer should generally stop lexing after reading one DOM node. This assumes that, as in XML, there is one root node in any input. This model appears to work well for a wide variety of cases (the majority, i would say), so please don't let this minor limitation bother you (if it makes it any better, it's really up to the lexer what defines "a root node", and it could even wrap its output in an artificial root if it needed to). That said, the node_builder interface /should/ work with multiple root nodes, but it has not really been tested. After a lexer()->yylex() loop, if parsing got a root node it will be available via root_node(). */ virtual FlexLexer *lexer(); /** This function is simply a shortcut for builder()->root_node(). The default condition is that the caller does not own the returned pointer. The caller owns the returned pointer if this->builder()->auto_delete() returns false, otherwise the pointer will be deleted when this object is cleaned up (via cleanup() or deletion). Typically clients should call this after a lexer()->yylex() loop to fetch the object the parser collected. */ s11n_node * root_node(); /** Returns the current dom builder, which gets (or should get) populated by this object's lexer(). If it is null then no data has been parsed in. You can get much more info about the tree via this object, but it is rarely necessary to do so. The caller does not own the returned pointer. This function may return NULL. */ S11N_NAMESPACE::node_builder * builder(); /** Resets the parser to use a new builder. 
This deletes the object returned by builder(), so be careful not to hang on to that pointer too long. It is intended to be used by subclasses but may have uses in other contexts. */ virtual void reset(); /** Should be called after you are done lexing and retrieving the nodes so this object can free the lexer and the builder (which also frees any loaded nodes). */ virtual void cleanup(); /** Convenience function: does a while(lexer()->yylex() != 0) loop and returns the last code returned by yylex(). If parsing gets a root node it will be available via root_node() or via the builder() object. */ int lexer_loop(); /** Convenience function: parses the given input stream and stops parsing when this->lexer() is done with it. This normally means it stops after matching one input node. Note that the "one node rule" is conventional only: subclasses may choose to parse more than one node out of the input stream before returning. s11n convention, however, is to assume that any loaded data is contained in one root node. If the load fails then it is still possible that the builder() has some content, but it may be in an undefined state. */ bool load( std::istream & in ); /** Subclasses need to call this, passing their proper lexer type. It will delete any previously-assigned object. It may be useful for non-subclasses to call this, but certainly only for unusual or particularly complex cases. */ void lexer( FlexLexer * lexer ); /** @deprecated This function is really no longer necessary: it has been obsoleted by S11N_NAMESPACE::node_loader's more flexible approach. It may still be useful in some specific contexts, but should be considered deprecated. load_builder() looks at the first line of the given file and tries to select an appropriate lexer for the token. It returns a pointer, perhaps NULL, which the caller takes responsibility for. If it finds a lexer but it finds no parseable tokens it will return NULL. 
If it finds a lexer it passes the file stream to the lexer, which will then populate the new builder object. If that fails then the builder is deleted and NULL is returned. Developer notes: This function WOULD take an istream, but i can't(?) reliably(?) read from it twice (once to get the magic cookie and once to feed the whole thing to the parser). Reading JUST the cookie, then passing the stream on to a different lexer for parsing screws up the stream (the bug is not yet localized). This function is implemented in select_lexer.flex.at. */ static FlexTreeBuilder *load_builder( const std::string & filename ); // /** // experimental way to generically support comment blocks in parsers. // */ // void comment_mode( bool b ) { this->m_comment_mode = b; }; // bool comment_mode() const { return this->m_comment_mode; }; protected: /** Only subclasses may instantiate objects of this class. */ FlexTreeBuilder(); private: FlexLexer * m_lexer; bool m_lexer_inited; // bool m_comment_mode; S11N_NAMESPACE::node_builder * m_builder; }; /** FunTxt is for parsing what's variously known as "text", "serial-text" and "Rusty's" format. It's a simple-grammared text-based DOM which is easy for humans to read and hand-edit, as well as fairly easy to parse. This format is used by the QUB project (qub.sourceforge.net) and libFunUtil (libfunutil.sourceforge.net). */ class FunTxt:public FlexTreeBuilder { // implementation lives in funtxt.flex.at public: FunTxt(){} virtual ~FunTxt(){} /** Re-implemented for internal reasons. */ virtual FlexLexer *lexer(); /** Re-implemented for internal reasons. */ virtual void reset(); }; /** FunXML is for parsing what's sometimes affectionately known as "fun-xml format", which is a very limited dialect of XML used extensively by QUB (qub.sourceforge.net) and libFunUtil (libfunutil.sourceforge.net). Known "Potential Problems": - does only rudimentary entity translation: only enough to support the parser. 
*/ class FunXML:public FlexTreeBuilder { // implementation lives in funxml.flex.at public: FunXML(){} virtual ~FunXML(){} /** Re-implemented for internal reasons. */ virtual FlexLexer *lexer(); /** Re-implemented for internal reasons. */ virtual void reset(); }; /** This flexer parses a simple XML dialect. This parser stores all XML attributes as properties, and objects with complex properties may very well result in unparseable data. Because s11n_node uses key=val properties to store all arbitrary data, and does not directly support the notion of CDATA, this class does some special handling of the "CDATA" property in deserialized nodes, and the SimpleXMLSerializer does the appropriate fiddling on the other end. i don't want to add a cdata-style type to the lower-level interface because it would only be there to accommodate XML, yet clients should not need to know they are working with XML. That said, however, XML-specific clients which use this class need to be aware that the CDATA is stored in the CDATA property of s11n_nodes which are deserialized by this class (e.g., available via node.get_string("CDATA")). This class does only rudimentary XML entity translation, but it should suffice for simple cases where the data need not necessarily be 100% XML-compliant. */ class SimpleXMLFlexer:public FlexTreeBuilder { // implementation lives in simplexml.flex.at public: SimpleXMLFlexer(){} virtual ~ SimpleXMLFlexer(){} /** Re-implemented for internal reasons. */ virtual FlexLexer *lexer(); /** Re-implemented for internal reasons. */ virtual void reset(); }; /** A builder for "the parenthesis format" (see paren.flex.at). */ class ParenTreeBuilder:public FlexTreeBuilder { // implementation lives in paren.flex.at public: ParenTreeBuilder() {} virtual ~ ParenTreeBuilder(){} /** Re-implemented for internal reasons. */ virtual FlexLexer *lexer(); /** Re-implemented for internal reasons. */ virtual void reset(); }; /** HexTreeBuilder is used by the hexFlexLexer. 
*/ class HexTreeBuilder:public FlexTreeBuilder { // implementation lives in hex.flex.at public: HexTreeBuilder(){} virtual ~ HexTreeBuilder() {} /** Re-implemented for internal reasons. */ virtual FlexLexer *lexer(); /** Re-implemented for internal reasons. */ virtual void reset(); }; /** CompactTreeBuilder is used by the compactFlexLexer. */ class CompactTreeBuilder:public FlexTreeBuilder { // implementation lives in compact.flex.at public: CompactTreeBuilder(){} virtual ~CompactTreeBuilder(){} /** Re-implemented for internal reasons. */ virtual FlexLexer *lexer(); /** Re-implemented for internal reasons. */ virtual void reset(); }; }; // namespace S11N_NAMESPACE #endif // S11N_NAMESPACE_FLEX_LEXERS_H_INCLUDED --- NEW FILE: ns.funtxt.flex.at --- %option c++ %{ namespace { // to get doxygen to ignore these comments :/ /** my first lex :) This code is for parsing Rusty Ballinger's "text mode serialization format" (as we call it). Here's a rough spec: nodename class=ClassName { property1_name property1_value property2_name "value with spaces" propertyN_name 'or single quotes' foo this \ line is split \ with backslashes. # comment lines nodename class=SomeClass { ... } } Extensions to the original format, probably not tolerated by older code and should probably be left out: - comment lines can start with ; or # or // - C++-style comment blocks are supported. Known problems: - If node_depth() is non-zero when this code starts parsing then it does not work properly at all. In practice this has never happened. - use proper lex STATES instead of assert_depth, if feasible. - add syntax extension: nodename class=SomeClass N where N is a number. That is, the node is replicated N times into the output. - add syntax extension: # nodename class=Foo will comment out the whole class block. 
*/ }; // namespace #define YY_SKIP_YYWRAP 1 int yywrap() { return 1; } // #include <stdio.h> #include <cassert> #include <iostream> #include <string> #include <deque> #include <S11N_NAMESPACE/s11n-macros.h> // COUT/CERR #include <S11N_NAMESPACE/string_util.h> // trim_string() #include <S11N_NAMESPACE/property_store.h> #include <S11N_NAMESPACE/class_loader.h> #include <S11N_NAMESPACE/instantiator.h> #include <S11N_NAMESPACE/key_value_parser.h> #include <S11N_NAMESPACE/node_builder.h> #include <S11N_NAMESPACE/flex_lexers.h> #include <S11N_NAMESPACE/Serializer.h> using std::cin; using std::cerr; using std::cout; using std::endl; #define LEXCERR CERR namespace { unsigned long ignored; unsigned long bracedepth; S11N_NAMESPACE::node_builder * serbuilder; std::string nodename; std::string nodeclass; std::string property; bool escaped; static S11N_NAMESPACE::key_value_parser kvp; /** Runs key_value_parser::parse(str) and saves the result in the lexer's kvp object. If this function returns false then kvp is in an undefined state, otherwise kvp holds a key and value parsed from str. Accepted formats: key value value value ... key = value value value ... key todo: return a std::pair instead of using key_value_parser (it's usage is legacy stuff). */ bool parseKVP( const std::string & str ) { // todo: clean this shit up!!! // std::string val = str; std::string delim = " \t"; // check for an '='. These are a syntax extension, and // may cause incompatibilities when reading old-format // data which contains a '='. 
// if( std::string::npos != str.find( "=" ) ) delim = "=";
        // Everything up to the first delimiter is the key ...
        std::string::size_type pos;
        pos = str.find_first_of( delim );
        std::string key = str.substr( 0, pos );
        S11N_NAMESPACE::trim_string( key );
        kvp.key( key );
        if( std::string::npos == pos )
        { // no delimiter at all: a key with an empty value
                kvp.value( "" );
                return true;
        }
        // pos = str.find_first_not_of( delim, pos );
        // if( std::string::npos == pos )
        // {
        //         kvp.value( "" );
        //         return true;
        // }
        // ... and everything after that delimiter is the value.
        std::string val = str.substr( pos+1 );
        // S11N_NAMESPACE::trim_string( );
        static S11N_NAMESPACE::FunTxtSerializer ser;
        S11N_NAMESPACE::translate_entities( val, ser.translations(), true );
        //CERR << "prop ["<<key<<"]=["<<val<<"]"<<std::endl;
        kvp.value( val );
        return true;
}
};

// Used at the top of an action: when the brace depth and the builder's
// node depth disagree, counts the token as ignored and returns 1.
#define assert_depth if( bracedepth != serbuilder->node_depth() ) { ++ignored; return 1; }

namespace S11N_NAMESPACE {

        /**
           Returns the lexer owned by the base class. On the first
           call no lexer exists yet, so reset() is called to create
           one.
        */
        FlexLexer * FunTxt::lexer()
        {
                FlexLexer * fp = 0;
                fp = this->FlexTreeBuilder::lexer();
                if( fp ) return fp;
                // else first-time setup:
                this->reset();
                return this->FlexTreeBuilder::lexer();
        }

        /**
           Resets the base class, installs a new sertxtFlexLexer and
           re-initializes this file's lexer state (serbuilder,
           ignored, bracedepth).
        */
        void FunTxt::reset()
        {
                this->FlexTreeBuilder::reset();
                //this->lexer( new sertxtFlexLexer() );
                // gcc 3.3 rejects the line above: no matching function for call to `FunTxt::lexer(FlexLexer*&)'
                // (FunTxt::lexer() hides the base class overload), so qualify the call instead.
                FlexLexer * foo = new sertxtFlexLexer();
                this->FlexTreeBuilder::lexer( foo );
                serbuilder = this->builder();
                ignored = 0;
                bracedepth = 0;
        }

} // namespace S11N_NAMESPACE
%}

@COMMON_DEFINITIONS@
 // COMMON_DEFINITIONS: See common_flex_definitions.at

%x READ_PROPERTY_VALUE

%%

"/*""*"* {
        // C-style comment block: swallow input up to and including
        // the closing star-slash pair, or up to the end of input.
        //
        // The previous loop only tested for 0 at end of input, so it
        // never terminated when yyinput() reports end of input with a
        // negative EOF, and it also missed a terminator preceded by
        // an even number of stars. yyinput() returns characters as
        // non-negative values, so (c <= 0) covers both end-of-input
        // conventions.
        ++ignored;
        // The pattern may already have eaten stars beyond the opener,
        // in which case a directly following slash closes the comment.
        bool prev_star = ( YYLeng() > 2 );
        for( ;; )
        {
                const int c = yyinput();
                if( c <= 0 ) break; // end of input inside the comment
                if( prev_star && '/' == c ) break; // found the terminator
                prev_star = ( '*' == c );
        }
        return 1;
}

({SPACE}*)([;#]|\/).* {
        ++ignored;
        return 1; /* single-line comment */
}

({SPACE}*){WORD}{SPACE}+{WORD}={CLASSNAME} {
        // nodename class=foo::Bar
        std::string foo = YYText();
        S11N_NAMESPACE::trim_string( foo );
        //LEXCERR << "class dec token=["<<foo<<"]"<<endl;
        nodename = foo.substr( 0, foo.find_first_of( " \t" ) );
        nodeclass = foo.substr( foo.find( "=" ) + 1 );
        int ret = serbuilder->open_node( nodeclass, nodename ) ? 1 : -1;
        if( (ret == -1) )
        {
                LEXCERR << foo << endl;
                return 0;
        }
        //LEXCERR << bracedepth << " opening class node " << nodeclass << " : " << nodename << std::endl;
        return ret;
}

\{ {
        // node's opening brace
        ++bracedepth;
        // nothing.
        return 1;
}

\} {
        // node's closing brace
        if( bracedepth == serbuilder->node_depth() )
        { // avoid closing node when open_node() fails
                //LEXCERR << bracedepth-1<<" closing node"<<std::endl;
                serbuilder->close_node();
        }
        --bracedepth;
        if( 0 == serbuilder->node_depth() )
        { // return once we close the first top-level node.
                //if( ignored ) LEXCERR << "lexer ignored " << ignored << " token"<<(ignored!=1?"s":"") << endl;
                return 0;
        }
        //LEXCERR << "node depth="<<serbuilder->node_depth()<<" brace_depth="<<bracedepth<<std::endl;
}

<READ_PROPERTY_VALUE>(\\\n)({SPACE})* {
        //property += " "; // YYText();
}

<READ_PROPERTY_VALUE>. { property += YYText(); }

<READ_PROPERTY_VALUE>^({SPACE})+ {;} // swallow it

<READ_PROPERTY_VALUE>\n {
        // end of line
        //S11N_NAMESPACE::trim_string( property );
        if( ! parseKVP( property ) )
        {
                // wasn't a key/value pair, so we'll lazily assume it's a key with no value.
kvp.key(property); kvp.value(""); //LEXCERR << "failed parsing key/value pair from property token ["<<property<<"]"<<endl; } std::string pval = kvp.value(); serbuilder->add_property( kvp.key(), pval ); //LEXCERR << "add_property(["<<kvp.key()<<"],["<<kvp.value()<<"])"<<std::endl; BEGIN INITIAL; } {PROPERTY_DECLS} { // property_name (=?) assert_depth; property = YYText(); BEGIN READ_PROPERTY_VALUE; } .|\n|{SPACE}+ {;} %% --- NEW FILE: ns.funxml.flex.at --- %option c++ %{ #define YY_SKIP_YYWRAP 1 int yywrap() { return 1; } #include <cassert> #include <iostream> #include <string> // #include <S11N_NAMESPACE/s11n-macros.h> // COUT/CERR #include <S11N_NAMESPACE/flex_lexers.h> #include <S11N_NAMESPACE/string_util.h> // translate_entities() #include <S11N_NAMESPACE/s11n_globals.h> // xml_entity_map() #include <S11N_NAMESPACE/Serializer.h> using std::cin; using std::cerr; using std::cout; using std::endl; // class FunXML_internal namespace { // public: unsigned long bracedepth; std::string nodename; std::string nodeclass; std::string yystr; std::string cdata; S11N_NAMESPACE::node_builder * the_builder; }; // struct FunXML_internal // FunXML_internal FunXMLi; namespace S11N_NAMESPACE { FlexLexer * FunXML::lexer() { FlexLexer * fp = 0; fp = this->FlexTreeBuilder::lexer(); if( fp ) return fp; // else first-time setup: this->reset(); return this->FlexTreeBuilder::lexer(); } void FunXML::reset() { this->FlexTreeBuilder::reset(); bracedepth = 0; this->FlexTreeBuilder::lexer(new serxmlFlexLexer()); the_builder = this->builder(); } } // namespace S11N_NAMESPACE %} @COMMON_DEFINITIONS@ // COMMON_DEFINITIONS: See common_flex_definitions.in KEY_TYPE ({WORD}|{NUMBER}) // NUMBER support is to help out s11n_node::deserialize_list/Map() %% \<{KEY_TYPE}{SPACE}+"class=\""{CLASSNAME}"\""{SPACE}*\> { // opening a node ++bracedepth; yystr = YYText(); //COUT << "class node? 
"<<yystr<<std::endl; std::string::size_type opos = yystr.find( "class=\"" ) + 7; std::string::size_type cpos = yystr.find( "\"", opos ); nodeclass = yystr.substr( opos, cpos - opos ); nodename = yystr.substr( 1, yystr.find_first_of( " \t\n" ) - 1 ); the_builder->open_node(nodeclass, nodename); continue; } \<{KEY_TYPE}\> { // opening a property cdata = ""; ++bracedepth; } \<\/{KEY_TYPE}\> { // closing something yystr = YYText(); if( bracedepth != the_builder->node_depth() ) { // closing a property. std::string prop = yystr.substr( 2, yystr.size() - 3 ); static S11N_NAMESPACE::FunXMLSerializer ser; S11N_NAMESPACE::translate_entities( cdata, ser.translations(), true ); the_builder->add_property( prop, cdata ); } else { // closing an object node. the_builder->close_node(); } --bracedepth; cdata = ""; if( 0 == the_builder->node_depth() ) { // return once we close the first top-level node. return 0; } } .|\n|({SPACE}) { cdata += YYText();} %% --- NEW FILE: ns.hex.flex.at --- %option c++ %{ // // LICENSE: Public Domain // Author: stephan - st...@s1... 
// #define YY_SKIP_YYWRAP 1 int yywrap() { return 1; } // #include <stdio.h> #include <cassert> #include <iostream> #include <string> #include <deque> #include <S11N_NAMESPACE/s11n-macros.h> // COUT/CERR #define PCERR CERR << "hex.flex error:" // #include <S11N_NAMESPACE/string_util.h> // trim_string() // #include <S11N_NAMESPACE/property_store.h> // #include <S11N_NAMESPACE/class_loader.h> // #include <S11N_NAMESPACE/instantiator.h> // #include <S11N_NAMESPACE/key_value_parser.h> #include <S11N_NAMESPACE/node_builder.h> #include <S11N_NAMESPACE/flex_lexers.h> #include <S11N_NAMESPACE/string_util.h> // hex2int() using std::cin; using std::cerr; using std::cout; using std::endl; /** Basic grammar spec for the "hexed" serialization format: {NODE_OPEN}{NAME_SIZE}{NODE_NAME}<class_name_size>{CLASSNAME} ({PROP_OPEN}<key_size><key><value_size><value>)* (sub-nodes)* {NODE_CLOSE} See the lex source for the meanings of the {TOKENS} named above. */ namespace { unsigned long node_depth = 0; unsigned int loops = 0; std::string word; std::string propname; std::string propval; std::string nodename; std::string nodeclass; bool in_prop; unsigned int decval = 0; unsigned int lcv = 0; S11N_NAMESPACE::node_builder * serbuilder = 0; } namespace S11N_NAMESPACE { FlexLexer * HexTreeBuilder::lexer() { FlexLexer * fp = 0; fp = this->FlexTreeBuilder::lexer(); if( fp ) return fp; // else first-time setup: this->reset(); return this->FlexTreeBuilder::lexer(); } void HexTreeBuilder::reset() { this->FlexTreeBuilder::reset(); FlexLexer * foo = new hexFlexLexer(); this->FlexTreeBuilder::lexer( foo ); serbuilder = this->builder(); node_depth = 0; } } // namespace S11N_NAMESPACE namespace { char inchar; } #define READWORD(SZ) word = ""; \ for( int i = 0; i < SZ; i++ )\ {\ inchar = yyinput(); \ if( 0 == inchar ) {word=""; return 0;} \ word += inchar; \ };\ decval = S11N_NAMESPACE::hex2int(word);\ if( 0 == decval ) { PCERR << "Error reading word (size="<<SZ<<"). Maybe reached end of input?" 
<< endl; return 0; } %} HEX_DIGIT ([a-fA-F0-9]) WORD2 {HEX_DIGIT}{2} // maintenance note: these hex codes must be kept in sync with those from HexSerializer's enum NODE_OPEN 11 NODE_CLOSE 10 PROP_OPEN 21 COOKIE 51190001 DATA_END 51190000 %% \n {;} {COOKIE} {;} {DATA_END} { return 0; } {NODE_OPEN} { //COUT << "Opening node." << std::endl; READWORD(2); // read node name size nodename = ""; loops = decval; for( lcv = 0; lcv < loops; lcv++ ) { // read node name READWORD(2); // read next char of node name. nodename += (unsigned char) decval; } //cout<< endl; READWORD(2); // get class name size nodeclass = ""; loops = decval; for( lcv = 0; lcv < loops; lcv++ ) { // read class name READWORD(2); // get next char nodeclass += (unsigned char) decval; } if( ! serbuilder->open_node( nodeclass, nodename ) ) { PCERR<< "open_node("<<nodeclass<<","<<nodename<<") failed." << endl; return 0; } } {NODE_CLOSE} { //COUT << "Closing node." << std::endl; serbuilder->close_node(); if( 0 == serbuilder->node_depth() ) { // stop once we close the first top-level node. return 0; } } {PROP_OPEN} { //COUTL( "Opening property" ); READWORD(2); // prop name size //COUT << "name size=" <<word << " dec="<<decval<<std::endl; propname = ""; loops = decval; for( lcv = 0; lcv < loops; lcv++ ) { // read property naem READWORD(2); // next char propname += (unsigned char) decval; } READWORD(8); // get value size propval = ""; loops = decval; for( lcv = 0; lcv < loops; lcv++ ) { // read property's value READWORD(2); // next char propval += (unsigned char) decval; } serbuilder->add_property( propname, propval ); } {WORD2}|[.] 
{ PCERR<< "unexpected token: " << YYText()<<std::endl; return 0; } %% #if HEX_DO_MAIN #include <S11N_NAMESPACE/s11n_io.h> // HexSerializer // #include <S11N_NAMESPACE/FlexShell.h> // #include <S11N_NAMESPACE/ELib.h> using namespace S11N_NAMESPACE; int main( int argc, char ** argv ) { S11N_NAMESPACE::HexTreeBuilder bob; FlexLexer * lexer = bob.lexer(); // FlexLexer * lexer = new hexFlexLexer(); while( 0 != (lexer->yylex() ) ); if( bob.root_node() ) { S11N_NAMESPACE::ParenSerializer ser; ser.serialize( *(bob.root_node()), std::cout ); } return 0; } #endif --- NEW FILE: ns.paren.flex.at --- %option c++ %{ /** LICENSE: Public Domain Author: stephan - st...@s1... This lexer reads in a lisp-like (but not lisp) grammar for the s11n framework. It's output partner is S11N_NAMESPACE::ParenSerializer. Sample: nodename=(ImplClassName (propery_name property value) (prop2 value of \) prop2) another_node=(ns::ClassName) ) nodename represents an s11n_node::name() ImplClassName represents the object's impl_class() value. Note that closing parens in your data must be backslash-escaped. This parser arguably strips all non-paired backslashes, so any actual backslashes must also be escaped (C-style). The ParensSerializer takes this into account and escapes it's serialized data. 
*/ #define YY_SKIP_YYWRAP 1 int yywrap() { return 1; } // #include <stdio.h> #include <cassert> #include <iostream> #include <string> #include <deque> #include <S11N_NAMESPACE/s11n-macros.h> // COUT/CERR #define PCERR CERR << "paren.flex error:" #include <S11N_NAMESPACE/string_util.h> // trim_string() // #include <S11N_NAMESPACE/property_store.h> // #include <S11N_NAMESPACE/class_loader.h> // #include <S11N_NAMESPACE/instantiator.h> // #include <S11N_NAMESPACE/key_value_parser.h> #include <S11N_NAMESPACE/node_builder.h> #include <S11N_NAMESPACE/flex_lexers.h> #include <S11N_NAMESPACE/Serializer.h> // for ParenSerializer::translations() using std::cin; using std::cerr; using std::cout; using std::endl; namespace { unsigned long node_depth = 0; std::string tmpstr; std::string nodename; std::string nodeclass; bool in_prop; S11N_NAMESPACE::node_builder * serbuilder = 0; } namespace S11N_NAMESPACE { FlexLexer * ParenTreeBuilder::lexer() { FlexLexer * fp = 0; fp = this->FlexTreeBuilder::lexer(); if( fp ) return fp; // else first-time setup: this->reset(); return this->FlexTreeBuilder::lexer(); } void ParenTreeBuilder::reset() { this->FlexTreeBuilder::reset(); FlexLexer * foo = new parenFlexLexer(); this->FlexTreeBuilder::lexer( foo ); serbuilder = this->builder(); node_depth = 0; } } // namespace S11N_NAMESPACE /***** *****/ %} @COMMON_DEFINITIONS@ %x OPEN_CLASS %x IN_PROPERTY OPENER \( CLOSER \) NODENAME {VARNAME} PROPERTY {VARNAME} %% {SPACE}*[#;].*$ {;} // comment lines "(*" { // (* comment blocks *) // Code mostly taken from the flex info pages. int c; while((c = yyinput()) != 0) { if(c == '*') { c = yyinput(); if( 0 == c ) { PCERR << "hit EOF in a (*comment block*)." << std::endl; return 0; } if( ')' == c ) break; //??? 
else unput(c); } } return 1; } {OPENER}{SPACE}* { BEGIN IN_PROPERTY; } <IN_PROPERTY>({VARNAME}|{NUMBER})({SPACE})* { // key name of property std::string propname = YYText(); // strip leading/trailing spaces from the property name: static const std::string avoid = " \t\n"; std::string::size_type tail = propname.find_last_not_of( avoid ); propname = propname.substr( propname.find_first_not_of( avoid ), (std::string::npos == tail) ? tail : (tail + 1) ); //COUT << "property ["<<propname<<"] = "; // Now we consider all data until a non-escaped closing brace // to be the value of the property... std::string propval = ""; unsigned char c = yyinput(); bool escaped = false; while( 0 != c ) { if( (!escaped) && ('\\' == c) ) { // next char will be considered escaped escaped = true; propval += c; c = yyinput(); continue; } if ( (!escaped) && (')' == c) ) { // Look for a non-escaped paren to close us. break; } escaped = false; propval += c; c = yyinput(); //COUT << "["<<c<<"]"<<endl; } if( 0 == c ) { PCERR << "Reached EOF while reading value for property '"<<propname<<"'. This shouldn't happen." << std::endl; return 0; } // S11N_NAMESPACE::trim_string( propval, S11N_NAMESPACE::TrimLeading ); static S11N_NAMESPACE::ParenSerializer ser; S11N_NAMESPACE::translate_entities( propval, ser.translations(), true ); //std::cout << "["<<propval<<"]"<<std::endl; serbuilder->add_property( propname, propval ); BEGIN 0; } {NODENAME}={OPENER} { tmpstr = YYText(); nodename = tmpstr.substr( 0, tmpstr.find( "=" ) ); //COUT << "node name="<<nodename<<std::endl; BEGIN OPEN_CLASS; } <OPEN_CLASS>{SPACE}+ {;} <OPEN_CLASS>{CLOSER} { // special case: empty node: nodename=() --node_depth; //COUT << "ignoring empty node " << nodename << std::endl; BEGIN 0; } <OPEN_CLASS>[{SPACE}\n]+ {;} <OPEN_CLASS>{CLASSNAME} { nodeclass = std::string(YYText()); if( nodeclass.empty() ) { PCERR << "nodes may not have empty names!" 
<<std::endl; return 0; } //COUT << "opening '"<<nodename<<"', class=" << nodeclass << std::endl; ++node_depth; if( ! serbuilder->open_node( nodeclass, nodename ) ) { PCERR<< "open_node("<<nodeclass<<","<<nodename<<") failed." << endl; return 0; } BEGIN 0; } <OPEN_CLASS>[.\n] { PCERR << "did not find class name after '"<<nodename<<"=(" << std::endl; return 0; } {CLOSER} { //COUT << node_depth<< " Closing node" << std::endl; serbuilder->close_node(); --node_depth; if( 0 == serbuilder->node_depth() ) { // stop once we close the first top-level node. return 0; } } ^"(S11N_NAMESPACE::parens)" {;} // magic cookie ({SPACE})|\n {;} . {;} // [.]+ { PCERR << "Unexpected token: [" << YYText()<<"]"<<std::endl;} %% // ^{SPACE}*[#;].*$ {;} // comment lines // [#;][^{OPENER}]* {;} // comments trailing after node closures #if PAREN_DO_MAIN #include <S11N_NAMESPACE/s11n_io.h> // ParenSerializer int main( int argc, char ** argv ) { S11N_NAMESPACE::ParenTreeBuilder bob; FlexLexer * lexer = bob.lexer(); while( 0 != (lexer->yylex() ) ); if( bob.root_node() ) { S11N_NAMESPACE::ParenSerializer ser; ser.serialize( *(bob.root_node()), std::cout ); } return 0; } #endif --- NEW FILE: ns.select_lexer.flex.at --- %option c++ %{ /** LICENSE: Public Domain Searchs the first line of input (consuming it) for a "magic cookie" by which to identify a lexer. Maintenance: This code knows about the formats supplied with the s11n library, so it must be updated as those formats change. 
*/ #define YY_SKIP_YYWRAP 1 int yywrap() { return 1; } #include <cassert> #include <iostream> #include <string> #include <sstream> #include <S11N_NAMESPACE/s11n-macros.h> // COUT/CERR // #include <S11N_NAMESPACE/node_builder.h> #include <S11N_NAMESPACE/flex_lexers.h> #include <S11N_NAMESPACE/file_util.h> // get_istream(), bytes_from_file() using std::cin; using std::cerr; using std::cout; using std::endl; namespace S11N_NAMESPACE { FlexTreeBuilder * m_selected_builder = 0; FlexTreeBuilder * FlexTreeBuilder::load_builder( const std::string & fname ) { m_selected_builder = 0; std::string token = S11N_NAMESPACE::bytes_from_file( fname, 50 ); if( token.empty() ) return NULL; //CERR << "cookie=["<<token<<"]"<<std::endl; std::istringstream cookiestream(token); select_lexerFlexLexer fl; fl.switch_streams( &cookiestream, 0 ); fl.yylex(); // only one iteration is needed unless the lexer code changes significantly. if( ! m_selected_builder ) { CERR << "FlexTreeBuilder::load_builder("<<fname<<") does not know how to handle this file." << std::endl; return NULL; } std::istream * is = S11N_NAMESPACE::get_istream( fname.c_str() ); if( ! is ) return NULL; // should never happen, since we were able to read the cookie. bool killit = false; FlexTreeBuilder * ret = m_selected_builder; if( ! ret->load( *is ) ) { killit = true; } delete( is ); if( ! m_selected_builder->root_node() ) { // no point, is there? 
killit = true; } if( killit ) { delete( m_selected_builder ); m_selected_builder = 0; ret = 0; } return ret; } }; %} @COMMON_DEFINITIONS@ // COMMON_DEFINITIONS: See common_flex_definitions.in %% ^"<!DOCTYPE SerialTree>" { // fun-xml S11N_NAMESPACE::m_selected_builder = new S11N_NAMESPACE::FunXML(); return 0; } ^"#SerialTree 1" { // fun-txt S11N_NAMESPACE::m_selected_builder = new S11N_NAMESPACE::FunTxt(); return 0; } ^"<!DOCTYPE S11N_NAMESPACE::simplexml>" { // s11n S11N_NAMESPACE::m_selected_builder = new S11N_NAMESPACE::SimpleXMLFlexer(); return 0; } ^"(S11N_NAMESPACE::parens)" { S11N_NAMESPACE::m_selected_builder = new S11N_NAMESPACE::ParenTreeBuilder(); return 0; } /** ^51190001 { S11N_NAMESPACE::m_selected_builder = new S11N_NAMESPACE::HexTreeBuilder(); return 0; } */ ^51191001 { S11N_NAMESPACE::m_selected_builder = new S11N_NAMESPACE::CompactTreeBuilder(); return 0; } ^"#!/S11N_NAMESPACE/TreeBuilder".* { // reminder: if i add a $ to the end is never matches! bug? CERR << "Future-compatibility file format:" << YYText() << std::endl; return 0; } (.|\n)+ { //CERR << "Eeek: " << YYText() << std::endl; S11N_NAMESPACE::m_selected_builder = NULL; return 0; } %% #if SELECT_LEXER_DO_MAIN int main() { FlexLexer * fl = new select_lexerFlexLexer(); while( fl->yylex() ); delete( fl ); } #endif --- NEW FILE: ns.simplexml.flex.at --- %option c++ %{ /** This flexer builds s11n_node trees out of a subset of XML. Most basic XML constructs are supported. XML CDATA, which is not directly supported by the s11n_node interface, is stored in the property named "CDATA", available via s11n_node::get_string("CDATA"). 
*/ #define YY_SKIP_YYWRAP 1 int yywrap() { return 1; } #include <cassert> #include <iostream> #include <string> #include <stack> #include <S11N_NAMESPACE/string_util.h> // normalize_string(), xml_entity_map(), etc #include <S11N_NAMESPACE/flex_lexers.h> #include <S11N_NAMESPACE/Serializer.h> #include <S11N_NAMESPACE/key_value_parser.h> #include <S11N_NAMESPACE/context_singleton.h> using std::cin; using std::cerr; using std::cout; using std::endl; namespace { unsigned long bracedepth; std::string nodename; std::string nodeclass; std::string yystr; S11N_NAMESPACE::node_builder * the_builder; typedef std::stack<std::string> StringStack; StringStack cdata; void cleanup() { nodename = ""; nodeclass = ""; yystr = ""; for( StringStack::size_type i = 0; i < cdata.size(); ++i ) cdata.pop(); // the_builder will be freed elsewhere. } }; namespace S11N_NAMESPACE { FlexLexer * SimpleXMLFlexer::lexer() { FlexLexer * fp = 0; fp = this->FlexTreeBuilder::lexer(); if( fp ) return fp; // else first-time setup: this->reset(); return this->FlexTreeBuilder::lexer(); } void SimpleXMLFlexer::reset() { this->FlexTreeBuilder::reset(); bracedepth = 0; FlexLexer * fp = new simplexmlFlexLexer(); this->FlexTreeBuilder::lexer(fp); fp->set_debug( 1 ); the_builder = this->builder(); } } // namespace S11N_NAMESPACE static S11N_NAMESPACE::key_value_parser sxml_kvp; int sxml_close_node() { // closing something if( bracedepth == the_builder->node_depth() ) { const std::string & cd = cdata.top(); // S11N_NAMESPACE::trim_string( cd ); if( ! cd.empty() ) the_builder->add_property( "CDATA", cd ); the_builder->close_node(); } if( 0 == the_builder->node_depth() ) { // return once we close the first top-level node. 
cleanup(); return 0; } cdata.pop(); --bracedepth; return the_builder->node_depth(); } %} @COMMON_DEFINITIONS@ // COMMON_DEFINITIONS: See common_flex_definitions.in KEY_TYPE ({WORD}|{CLASSNAME}|{NUMBER}) // NUMBER is to help out s11n_node::deserialize_list/Map() %x IN_ELEM_DECL %x IN_COMMENT %x IN_CDATA %% // \<\!.+\n { // continue; // } "<?"[^>]+ {;} // todo: add a state to read through these! "<![CDATA[" { BEGIN IN_CDATA; } <IN_CDATA>"]]>" { BEGIN 0; } <INITIAL>^({SPACE}+) {continue;} <INITIAL>({SPACE}+)$ {continue;} <IN_CDATA>(.|\n) { if( ! cdata.empty() ) cdata.top() += YYText(); continue; } "<"{KEY_TYPE} { yystr = YYText(); //COUT << "opening element? "<<yystr<<std::endl; nodename = yystr.substr( 1 ); the_builder->open_node("NoClassYet", nodename); cdata.push(std::string()); ++bracedepth; BEGIN IN_ELEM_DECL; } <IN_ELEM_DECL>{KEY_TYPE}=({QUOTED_STRING}|{RGB_COLOR}|{KEY_TYPE}|([a-zA-Z_0-9]+)|(\"\")) { // foo if( ! sxml_kvp.parse( YYText() ) ) { CERR << "syntax error: could not parse key=value from ["<<YYText()<<"]"<<std::endl; cleanup(); return 0; } std::string attr = sxml_kvp.value(); if( attr.size() >= 2 ) { // strip quotes attr.erase(attr.begin()); attr.erase(attr.end()-1); } S11N_NAMESPACE::translate_entities( attr, S11N_NAMESPACE::context_singleton<S11N_NAMESPACE::SimpleXMLSerializer,void>::instance().translations(), true ); std::string k = sxml_kvp.key(); if( "s11n_class" == k ) { the_builder->current_node()->impl_class( attr ); } else { the_builder->add_property( k, attr ); } //CERR << "property val=["<<attr<<"] --> ["<<propval<<"]"<<std::endl; } <IN_ELEM_DECL>\/{SPACE}*> { if( 0 == sxml_close_node() ) return 0; BEGIN 0; } <IN_ELEM_DECL>">" { // closing element decl. //yystr = YYText(); BEGIN 0; } <IN_ELEM_DECL>"<" { CERR << "syntax error: we found a '<' character inside an element declaration." 
<< std::endl; cleanup(); return 0; ; } <IN_ELEM_DECL>(.|\n) {;} \<\/({KEY_TYPE})({SPACE}*)\> { if( 0 == sxml_close_node() ) return 0; } "<!--" { //COUT << "entering comment..." << std::endl; BEGIN IN_COMMENT; } <IN_COMMENT>"<!--" { CERR << "syntax error: you may not have comments within comments." << std::endl; cleanup(); return 0; //BEGIN 0; } <IN_COMMENT>"-->" { //COUT << "... exiting comment" << std::endl; BEGIN 0; } <IN_COMMENT>[.\n] {;} .|\n {;} // swallow it %% #if SIMPLEXML_DO_MAIN #include <S11N_NAMESPACE/s11n_io.h> using namespace S11N_NAMESPACE; int main( int argc, char ** argv ) { SimpleXMLFlexer xmlflex; FlexLexer * lexer = xmlflex.lexer(); while( 0 != (lexer->yylex() ) ); std::cout << endl; if( s11n_node * node = xmlflex.root_node() ) { node->serialize_properties( *node ); COUT << "Re-serialized:\n"; s11n_io<SimpleXMLSerializer>::save( *node, std::cout ); s11n_io<FunTxtSerializer>::save( *node, std::cout ); s11n_io<FunXMLSerializer>::save( *node, std::cout ); } return 0; } #endif |