[ Index ]

PHP Cross Reference of JPSpan 0.4 (beta)

title

Body

[close]

/JPSpan/ -> Lexer.php (source)

   1  <?php
   2  /**
   3  * Author Marcus Baker: http://www.lastcraft.com
   4  * Version adapted from Simple Test: http://sourceforge.net/projects/simpletest/
   5  * @author Marcus Baker
   6  * @package JPSpan
   7  * @subpackage Lexer
   8  * @version $Id: Lexer.php,v 1.1 2004/11/10 16:04:47 harryf Exp $
   9  */
  10  /**#@+
  11   * lexer mode constant
  12   */
  13  define("JPSPAN_LEXER_ENTER", 1);
  14  define("JPSPAN_LEXER_MATCHED", 2);
  15  define("JPSPAN_LEXER_UNMATCHED", 3);
  16  define("JPSPAN_LEXER_EXIT", 4);
  17  define("JPSPAN_LEXER_SPECIAL", 5);
  18  /**#@-*/
  19  
  20  /**
  21   *    Compounded regular expression. Any of
  22   *    the contained patterns could match and
  23   *    when one does it's label is returned.
  24   *    @package JPSpan
  25   *    @subpackage Lexer
  26   */
  27  class JPSpan_LexerParallelRegex {
  28      var $_patterns;
  29      var $_labels;
  30      var $_regex;
  31      var $_case;
  32      
  33      /**
  34       *    Constructor. Starts with no patterns.
  35       *    @param boolean $case    True for case sensitive, false
  36       *                            for insensitive.
  37       *    @access public
  38       */
  39  	function JPSpan_LexerParallelRegex($case) {
  40          $this->_case = $case;
  41          $this->_patterns = array();
  42          $this->_labels = array();
  43          $this->_regex = null;
  44      }
  45      
  46      /**
  47       *    Adds a pattern with an optional label.
  48       *    @param string $pattern      Perl style regex, but ( and )
  49       *                                lose the usual meaning.
  50       *    @param string $label        Label of regex to be returned
  51       *                                on a match.
  52       *    @access public
  53       */
  54  	function addPattern($pattern, $label = true) {
  55          $count = count($this->_patterns);
  56          $this->_patterns[$count] = $pattern;
  57          $this->_labels[$count] = $label;
  58          $this->_regex = null;
  59      }
  60      
  61      /**
  62       *    Attempts to match all patterns at once against
  63       *    a string.
  64       *    @param string $subject      String to match against.
  65       *    @param string $match        First matched portion of
  66       *                                subject.
  67       *    @return boolean             True on success.
  68       *    @access public
  69       */
  70  	function match($subject, &$match) {
  71          if (count($this->_patterns) == 0) {
  72              return false;
  73          }
  74          if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
  75              $match = "";
  76              return false;
  77          }
  78          $match = $matches[0];
  79          for ($i = 1; $i < count($matches); $i++) {
  80              if ($matches[$i]) {
  81                  return $this->_labels[$i - 1];
  82              }
  83          }
  84          return true;
  85      }
  86      
  87      /**
  88       *    Compounds the patterns into a single
  89       *    regular expression separated with the
  90       *    "or" operator. Caches the regex.
  91       *    Will automatically escape (, ) and / tokens.
  92       *    @param array $patterns    List of patterns in order.
  93       *    @access private
  94       */
  95  	function _getCompoundedRegex() {
  96          if ($this->_regex == null) {
  97              for ($i = 0; $i < count($this->_patterns); $i++) {
  98                  $this->_patterns[$i] = '(' . str_replace(
  99                          array('/', '(', ')'),
 100                          array('\/', '\(', '\)'),
 101                          $this->_patterns[$i]) . ')';
 102              }
 103              $this->_regex = "/" . implode("|", $this->_patterns) . "/" . $this->_getPerlMatchingFlags();
 104          }
 105          return $this->_regex;
 106      }
 107      
 108      /**
 109       *    Accessor for perl regex mode flags to use.
 110       *    @return string       Perl regex flags.
 111       *    @access private
 112       */
 113  	function _getPerlMatchingFlags() {
 114          return ($this->_case ? "msS" : "msSi");
 115      }
 116  }
 117  
 118  /**
 119   *    States for a stack machine.
 120   *    @package JPSpan
 121   *    @subpackage Lexer
 122   */
 123  class JPSpan_LexerStateStack {
 124      var $_stack;
 125      
 126      /**
 127       *    Constructor. Starts in named state.
 128       *    @param string $start        Starting state name.
 129       *    @access public
 130       */
 131  	function JPSpan_LexerStateStack($start) {
 132          $this->_stack = array($start);
 133      }
 134      
 135      /**
 136       *    Accessor for current state.
 137       *    @return string       State.
 138       *    @access public
 139       */
 140  	function getCurrent() {
 141          return $this->_stack[count($this->_stack) - 1];
 142      }
 143      
 144      /**
 145       *    Adds a state to the stack and sets it
 146       *    to be the current state.
 147       *    @param string $state        New state.
 148       *    @access public
 149       */
 150  	function enter($state) {
 151          array_push($this->_stack, $state);
 152      }
 153      
 154      /**
 155       *    Leaves the current state and reverts
 156       *    to the previous one.
 157       *    @return boolean    False if we drop off
 158       *                       the bottom of the list.
 159       *    @access public
 160       */
 161  	function leave() {
 162          if (count($this->_stack) == 1) {
 163              return false;
 164          }
 165          array_pop($this->_stack);
 166          return true;
 167      }
 168  }
 169  
 170  /**
 171   *    Accepts text and breaks it into tokens.
 172   *    Some optimisation to make the sure the
 173   *    content is only scanned by the PHP regex
 174   *    parser once. Lexer modes must not start
 175   *    with leading underscores.
 176   *    @package JPSpan
 177   *    @subpackage Lexer
 178   */
 179  class JPSpan_Lexer {
 180      var $_regexes;
 181      var $_parser;
 182      var $_mode;
 183      var $_mode_handlers;
 184      var $_case;
 185      
 186      /**
 187       *    Sets up the lexer in case insensitive matching
 188       *    by default.
 189       *    @param JPSpan_Parser $parser  Handling strategy by
 190       *                                    reference.
 191       *    @param string $start            Starting handler.
 192       *    @param boolean $case            True for case sensitive.
 193       *    @access public
 194       */
 195  	function JPSpan_Lexer(&$parser, $start = "accept", $case = false) {
 196          $this->_case = $case;
 197          $this->_regexes = array();
 198          $this->_parser = &$parser;
 199          $this->_mode = &new JPSpan_LexerStateStack($start);
 200          $this->_mode_handlers = array();
 201      }
 202      
 203      /**
 204       *    Adds a token search pattern for a particular
 205       *    parsing mode. The pattern does not change the
 206       *    current mode.
 207       *    @param string $pattern      Perl style regex, but ( and )
 208       *                                lose the usual meaning.
 209       *    @param string $mode         Should only apply this
 210       *                                pattern when dealing with
 211       *                                this type of input.
 212       *    @access public
 213       */
 214  	function addPattern($pattern, $mode = "accept") {
 215          if (! isset($this->_regexes[$mode])) {
 216              $this->_regexes[$mode] = new JPSpan_LexerParallelRegex($this->_case);
 217          }
 218          $this->_regexes[$mode]->addPattern($pattern);
 219      }
 220      
 221      /**
 222       *    Adds a pattern that will enter a new parsing
 223       *    mode. Useful for entering parenthesis, strings,
 224       *    tags, etc.
 225       *    @param string $pattern      Perl style regex, but ( and )
 226       *                                lose the usual meaning.
 227       *    @param string $mode         Should only apply this
 228       *                                pattern when dealing with
 229       *                                this type of input.
 230       *    @param string $new_mode     Change parsing to this new
 231       *                                nested mode.
 232       *    @access public
 233       */
 234  	function addEntryPattern($pattern, $mode, $new_mode) {
 235          if (! isset($this->_regexes[$mode])) {
 236              $this->_regexes[$mode] = new JPSpan_LexerParallelRegex($this->_case);
 237          }
 238          $this->_regexes[$mode]->addPattern($pattern, $new_mode);
 239      }
 240      
 241      /**
 242       *    Adds a pattern that will exit the current mode
 243       *    and re-enter the previous one.
 244       *    @param string $pattern      Perl style regex, but ( and )
 245       *                                lose the usual meaning.
 246       *    @param string $mode         Mode to leave.
 247       *    @access public
 248       */
 249  	function addExitPattern($pattern, $mode) {
 250          if (! isset($this->_regexes[$mode])) {
 251              $this->_regexes[$mode] = new JPSpan_LexerParallelRegex($this->_case);
 252          }
 253          $this->_regexes[$mode]->addPattern($pattern, "__exit");
 254      }
 255      
 256      /**
 257       *    Adds a pattern that has a special mode. Acts as an entry
 258       *    and exit pattern in one go, effectively calling a special
 259       *    parser handler for this token only.
 260       *    @param string $pattern      Perl style regex, but ( and )
 261       *                                lose the usual meaning.
 262       *    @param string $mode         Should only apply this
 263       *                                pattern when dealing with
 264       *                                this type of input.
 265       *    @param string $special      Use this mode for this one token.
 266       *    @access public
 267       */
 268  	function addSpecialPattern($pattern, $mode, $special) {
 269          if (! isset($this->_regexes[$mode])) {
 270              $this->_regexes[$mode] = new JPSpan_LexerParallelRegex($this->_case);
 271          }
 272          $this->_regexes[$mode]->addPattern($pattern, "_$special");
 273      }
 274      
 275      /**
 276       *    Adds a mapping from a mode to another handler.
 277       *    @param string $mode        Mode to be remapped.
 278       *    @param string $handler     New target handler.
 279       *    @access public
 280       */
 281  	function mapHandler($mode, $handler) {
 282          $this->_mode_handlers[$mode] = $handler;
 283      }
 284      
 285      /**
 286       *    Splits the page text into tokens. Will fail
 287       *    if the handlers report an error or if no
 288       *    content is consumed. If successful then each
 289       *    unparsed and parsed token invokes a call to the
 290       *    held listener.
 291       *    @param string $raw        Raw HTML text.
 292       *    @return boolean           True on success, else false.
 293       *    @access public
 294       */
 295  	function parse($raw) {
 296          if (! isset($this->_parser)) {
 297              return false;
 298          }
 299          $length = strlen($raw);
 300          while (is_array($parsed = $this->_reduce($raw))) {
 301              list($unmatched, $matched, $mode) = $parsed;
 302              if (! $this->_dispatchTokens($unmatched, $matched, $mode)) {
 303                  return false;
 304              }
 305              if (strlen($raw) == $length) {
 306                  return false;
 307              }
 308              $length = strlen($raw);
 309          }
 310          if (!$parsed) {
 311              return false;
 312          }
 313          return $this->_invokeParser($raw, JPSPAN_LEXER_UNMATCHED);
 314      }
 315      
 316      /**
 317       *    Sends the matched token and any leading unmatched
 318       *    text to the parser changing the lexer to a new
 319       *    mode if one is listed.
 320       *    @param string $unmatched    Unmatched leading portion.
 321       *    @param string $matched      Actual token match.
 322       *    @param string $mode         Mode after match. A boolean
 323       *                                false mode causes no change.
 324       *    @return boolean             False if there was any error
 325       *                                from the parser.
 326       *    @access private
 327       */
 328  	function _dispatchTokens($unmatched, $matched, $mode = false) {
 329          if (! $this->_invokeParser($unmatched, JPSPAN_LEXER_UNMATCHED)) {
 330              return false;
 331          }
 332          if ($this->_isModeEnd($mode)) {
 333              if (! $this->_invokeParser($matched, JPSPAN_LEXER_EXIT)) {
 334                  return false;
 335              }
 336              return $this->_mode->leave();
 337          }
 338          if ($this->_isSpecialMode($mode)) {
 339              $this->_mode->enter($this->_decodeSpecial($mode));
 340              if (! $this->_invokeParser($matched, JPSPAN_LEXER_SPECIAL)) {
 341                  return false;
 342              }
 343              return $this->_mode->leave();
 344          }
 345          if (is_string($mode)) {
 346              $this->_mode->enter($mode);
 347              return $this->_invokeParser($matched, JPSPAN_LEXER_ENTER);
 348          }
 349          return $this->_invokeParser($matched, JPSPAN_LEXER_MATCHED);
 350      }
 351      
 352      /**
 353       *    Tests to see if the new mode is actually to leave
 354       *    the current mode and pop an item from the matching
 355       *    mode stack.
 356       *    @param string $mode    Mode to test.
 357       *    @return boolean        True if this is the exit mode.
 358       *    @access private
 359       */
 360  	function _isModeEnd($mode) {
 361          return ($mode === "__exit");
 362      }
 363      
 364      /**
 365       *    Test to see if the mode is one where this mode
 366       *    is entered for this token only and automatically
 367       *    leaves immediately afterwoods.
 368       *    @param string $mode    Mode to test.
 369       *    @return boolean        True if this is the exit mode.
 370       *    @access private
 371       */
 372  	function _isSpecialMode($mode) {
 373          return (strncmp($mode, "_", 1) == 0);
 374      }
 375      
 376      /**
 377       *    Strips the magic underscore marking single token
 378       *    modes.
 379       *    @param string $mode    Mode to decode.
 380       *    @return string         Underlying mode name.
 381       *    @access private
 382       */
 383  	function _decodeSpecial($mode) {
 384          return substr($mode, 1);
 385      }
 386      
 387      /**
 388       *    Calls the parser method named after the current
 389       *    mode. Empty content will be ignored. The lexer
 390       *    has a parser handler for each mode in the lexer.
 391       *    @param string $content        Text parsed.
 392       *    @param boolean $is_match      Token is recognised rather
 393       *                                  than unparsed data.
 394       *    @access private
 395       */
 396  	function _invokeParser($content, $is_match) {
 397          if (($content === "") || ($content === false)) {
 398              return true;
 399          }
 400          $handler = $this->_mode->getCurrent();
 401          if (isset($this->_mode_handlers[$handler])) {
 402              $handler = $this->_mode_handlers[$handler];
 403          }
 404          return $this->_parser->$handler($content, $is_match);
 405      }
 406      
 407      /**
 408       *    Tries to match a chunk of text and if successful
 409       *    removes the recognised chunk and any leading
 410       *    unparsed data. Empty strings will not be matched.
 411       *    @param string $raw         The subject to parse. This is the
 412       *                               content that will be eaten.
 413       *    @return array              Three item list of unparsed
 414       *                               content followed by the
 415       *                               recognised token and finally the
 416       *                               action the parser is to take.
 417       *                               True if no match, false if there
 418       *                               is a parsing error.
 419       *    @access private
 420       */
 421  	function _reduce(&$raw) {
 422          if (! isset($this->_regexes[$this->_mode->getCurrent()])) {
 423              return false;
 424          }
 425          if ($raw === "") {
 426              return true;
 427          }
 428          if ($action = $this->_regexes[$this->_mode->getCurrent()]->match($raw, $match)) {
 429              $unparsed_character_count = strpos($raw, $match);
 430              $unparsed = substr($raw, 0, $unparsed_character_count);
 431              $raw = substr($raw, $unparsed_character_count + strlen($match));
 432              return array($unparsed, $match, $action);
 433          }
 434          return true;
 435      }
 436  }
 437  ?>


Generated: Fri Nov 26 11:42:46 2004 Cross-referenced by PHPXref 0.6