| [ Index ] |
PHP Cross Reference of JPSpan 0.4 (beta) |
[Summary view] [Print] [Text view]
<?php
/**
 * Author Marcus Baker: http://www.lastcraft.com
 * Version adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * @author Marcus Baker
 * @package JPSpan
 * @subpackage Lexer
 * @version $Id: Lexer.php,v 1.1 2004/11/10 16:04:47 harryf Exp $
 */
/**#@+
 * lexer mode constant
 */
define("JPSPAN_LEXER_ENTER", 1);
define("JPSPAN_LEXER_MATCHED", 2);
define("JPSPAN_LEXER_UNMATCHED", 3);
define("JPSPAN_LEXER_EXIT", 4);
define("JPSPAN_LEXER_SPECIAL", 5);
/**#@-*/

/**
 * Compounded regular expression. Any of
 * the contained patterns could match and
 * when one does its label is returned.
 * @package JPSpan
 * @subpackage Lexer
 */
class JPSpan_LexerParallelRegex {
    var $_patterns;
    var $_labels;
    var $_regex;
    var $_case;

    /**
     * Constructor. Starts with no patterns.
     * @param boolean $case    True for case sensitive, false
     *                         for insensitive.
     * @access public
     */
    function __construct($case) {
        $this->_case = $case;
        $this->_patterns = array();
        $this->_labels = array();
        $this->_regex = null;
    }

    /**
     * Legacy class-named constructor. Kept so code that calls it
     * explicitly by name keeps working; it only forwards to
     * __construct().
     * @param boolean $case    True for case sensitive, false
     *                         for insensitive.
     * @access public
     */
    function JPSpan_LexerParallelRegex($case) {
        $this->__construct($case);
    }

    /**
     * Adds a pattern with an optional label.
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $label        Label of regex to be returned
     *                             on a match.
     * @access public
     */
    function addPattern($pattern, $label = true) {
        $this->_patterns[] = $pattern;
        $this->_labels[] = $label;
        // Invalidate the cached compound expression.
        $this->_regex = null;
    }

    /**
     * Attempts to match all patterns at once against
     * a string.
     * @param string $subject      String to match against.
     * @param string $match        First matched portion of
     *                             subject. Empty string when
     *                             nothing matched.
     * @param integer $offset      Byte offset of the match within
     *                             the subject, or false when
     *                             nothing matched.
     * @return mixed               The label of the pattern that
     *                             matched (true for unlabelled
     *                             patterns), false on no match.
     * @access public
     */
    function match($subject, &$match, &$offset = null) {
        $match = "";
        $offset = false;
        if (count($this->_patterns) == 0) {
            return false;
        }
        if (! preg_match($this->_getCompoundedRegex(), $subject, $matches, PREG_OFFSET_CAPTURE)) {
            return false;
        }
        $match = $matches[0][0];
        $offset = $matches[0][1];
        $group_count = count($matches);
        for ($i = 1; $i < $group_count; $i++) {
            // Compare against the empty string rather than testing
            // truthiness, otherwise a token of "0" loses its label.
            if ($matches[$i][0] !== "") {
                return $this->_labels[$i - 1];
            }
        }
        return true;
    }

    /**
     * Compounds the patterns into a single
     * regular expression separated with the
     * "or" operator. Caches the regex.
     * Will automatically escape (, ) and / tokens.
     * The stored patterns are left untouched so the
     * expression can be rebuilt after more patterns
     * have been added.
     * @return string      Complete regular expression.
     * @access private
     */
    function _getCompoundedRegex() {
        if ($this->_regex === null) {
            $groups = array();
            foreach ($this->_patterns as $pattern) {
                $groups[] = '(' . str_replace(
                        array('/', '(', ')'),
                        array('\/', '\(', '\)'),
                        $pattern) . ')';
            }
            $this->_regex = "/" . implode("|", $groups) . "/" . $this->_getPerlMatchingFlags();
        }
        return $this->_regex;
    }

    /**
     * Accessor for perl regex mode flags to use.
     * @return string       Perl regex flags.
     * @access private
     */
    function _getPerlMatchingFlags() {
        return ($this->_case ? "msS" : "msSi");
    }
}

/**
 * States for a stack machine.
 * @package JPSpan
 * @subpackage Lexer
 */
class JPSpan_LexerStateStack {
    var $_stack;

    /**
     * Constructor. Starts in named state.
     * @param string $start        Starting state name.
     * @access public
     */
    function __construct($start) {
        $this->_stack = array($start);
    }

    /**
     * Legacy class-named constructor. Kept so code that calls it
     * explicitly by name keeps working; it only forwards to
     * __construct().
     * @param string $start        Starting state name.
     * @access public
     */
    function JPSpan_LexerStateStack($start) {
        $this->__construct($start);
    }

    /**
     * Accessor for current state.
     * @return string       State.
     * @access public
     */
    function getCurrent() {
        return $this->_stack[count($this->_stack) - 1];
    }

    /**
     * Adds a state to the stack and sets it
     * to be the current state.
     * @param string $state        New state.
     * @access public
     */
    function enter($state) {
        array_push($this->_stack, $state);
    }

    /**
     * Leaves the current state and reverts
     * to the previous one.
     * @return boolean    False if we drop off
     *                    the bottom of the list.
     * @access public
     */
    function leave() {
        if (count($this->_stack) == 1) {
            return false;
        }
        array_pop($this->_stack);
        return true;
    }
}

/**
 * Accepts text and breaks it into tokens.
 * Some optimisation to make sure the
 * content is only scanned by the PHP regex
 * parser once. Lexer modes must not start
 * with leading underscores.
 * @package JPSpan
 * @subpackage Lexer
 */
class JPSpan_Lexer {
    var $_regexes;
    var $_parser;
    var $_mode;
    var $_mode_handlers;
    var $_case;

    /**
     * Sets up the lexer in case insensitive matching
     * by default.
     * @param JPSpan_Parser $parser  Handling strategy by
     *                               reference.
     * @param string $start          Starting handler.
     * @param boolean $case          True for case sensitive.
     * @access public
     */
    function __construct(&$parser, $start = "accept", $case = false) {
        $this->_case = $case;
        $this->_regexes = array();
        $this->_parser = &$parser;
        // Plain assignment: "= &new" is a parse error from PHP 7 on.
        $this->_mode = new JPSpan_LexerStateStack($start);
        $this->_mode_handlers = array();
    }

    /**
     * Legacy class-named constructor. Kept so code that calls it
     * explicitly by name keeps working; it only forwards to
     * __construct().
     * @param JPSpan_Parser $parser  Handling strategy by
     *                               reference.
     * @param string $start          Starting handler.
     * @param boolean $case          True for case sensitive.
     * @access public
     */
    function JPSpan_Lexer(&$parser, $start = "accept", $case = false) {
        $this->__construct($parser, $start, $case);
    }

    /**
     * Adds a token search pattern for a particular
     * parsing mode. The pattern does not change the
     * current mode.
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     * @access public
     */
    function addPattern($pattern, $mode = "accept") {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new JPSpan_LexerParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing
     * mode. Useful for entering parenthesis, strings,
     * tags, etc.
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     * @param string $new_mode     Change parsing to this new
     *                             nested mode.
     * @access public
     */
    function addEntryPattern($pattern, $mode, $new_mode) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new JPSpan_LexerParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode
     * and re-enter the previous one.
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Mode to leave.
     * @access public
     */
    function addExitPattern($pattern, $mode) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new JPSpan_LexerParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode. Acts as an entry
     * and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     * @param string $special      Use this mode for this one token.
     * @access public
     */
    function addSpecialPattern($pattern, $mode, $special) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new JPSpan_LexerParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     * @access public
     */
    function mapHandler($mode, $handler) {
        $this->_mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens. Will fail
     * if the handlers report an error or if no
     * content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the
     * held listener.
     * @param string $raw        Raw HTML text.
     * @return boolean           True on success, else false.
     * @access public
     */
    function parse($raw) {
        if (! isset($this->_parser)) {
            return false;
        }
        $length = strlen($raw);
        while (is_array($parsed = $this->_reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            if (! $this->_dispatchTokens($unmatched, $matched, $mode)) {
                return false;
            }
            // A step that consumed nothing would loop forever.
            if (strlen($raw) == $length) {
                return false;
            }
            $length = strlen($raw);
        }
        if (! $parsed) {
            return false;
        }
        // Whatever is left over is trailing unmatched text.
        return $this->_invokeParser($raw, JPSPAN_LEXER_UNMATCHED);
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     * @param string $unmatched    Unmatched leading portion.
     * @param string $matched      Actual token match.
     * @param string $mode         Mode after match. A boolean
     *                             false mode causes no change.
     * @return boolean             False if there was any error
     *                             from the parser.
     * @access private
     */
    function _dispatchTokens($unmatched, $matched, $mode = false) {
        if (! $this->_invokeParser($unmatched, JPSPAN_LEXER_UNMATCHED)) {
            return false;
        }
        if ($this->_isModeEnd($mode)) {
            if (! $this->_invokeParser($matched, JPSPAN_LEXER_EXIT)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if ($this->_isSpecialMode($mode)) {
            $this->_mode->enter($this->_decodeSpecial($mode));
            if (! $this->_invokeParser($matched, JPSPAN_LEXER_SPECIAL)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if (is_string($mode)) {
            $this->_mode->enter($mode);
            return $this->_invokeParser($matched, JPSPAN_LEXER_ENTER);
        }
        return $this->_invokeParser($matched, JPSPAN_LEXER_MATCHED);
    }

    /**
     * Tests to see if the new mode is actually to leave
     * the current mode and pop an item from the matching
     * mode stack.
     * @param string $mode    Mode to test.
     * @return boolean        True if this is the exit mode.
     * @access private
     */
    function _isModeEnd($mode) {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode
     * is entered for this token only and automatically
     * leaves immediately afterwards.
     * @param string $mode    Mode to test.
     * @return boolean        True if the mode name starts with
     *                        the underscore marker.
     * @access private
     */
    function _isSpecialMode($mode) {
        // Unlabelled patterns carry boolean true, never a marker.
        return (is_string($mode) && strncmp($mode, "_", 1) == 0);
    }

    /**
     * Strips the magic underscore marking single token
     * modes.
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     * @access private
     */
    function _decodeSpecial($mode) {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current
     * mode. Empty content will be ignored. The lexer
     * has a parser handler for each mode in the lexer.
     * @param string $content        Text parsed.
     * @param integer $is_match      One of the JPSPAN_LEXER_*
     *                               constants describing the token.
     * @return boolean               Result of the handler, or true
     *                               when the content was empty.
     * @access private
     */
    function _invokeParser($content, $is_match) {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->_mode->getCurrent();
        if (isset($this->_mode_handlers[$handler])) {
            $handler = $this->_mode_handlers[$handler];
        }
        return $this->_parser->$handler($content, $is_match);
    }

    /**
     * Tries to match a chunk of text and if successful
     * removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     * @param string $raw         The subject to parse. This is the
     *                            content that will be eaten.
     * @return array              Three item list of unparsed
     *                            content followed by the
     *                            recognised token and finally the
     *                            action the parser is to take.
     *                            True if no match, false if there
     *                            is a parsing error.
     * @access private
     */
    function _reduce(&$raw) {
        $current = $this->_mode->getCurrent();
        if (! isset($this->_regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        $action = $this->_regexes[$current]->match($raw, $match, $offset);
        if ($action) {
            // Split at the offset reported by the regex engine. Searching
            // for the token text again could find an earlier occurrence
            // that the pattern itself had rejected.
            $unparsed = substr($raw, 0, $offset);
            $raw = substr($raw, $offset + strlen($match));
            return array($unparsed, $match, $action);
        }
        return true;
    }
}
?>
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
| Generated: Fri Nov 26 11:42:46 2004 | Cross-referenced by PHPXref 0.6 |