diff options
Diffstat (limited to 'languages/utils/CLDRPluralRuleConverter.php')
-rw-r--r-- | languages/utils/CLDRPluralRuleConverter.php | 322 |
1 files changed, 322 insertions, 0 deletions
<?php
/**
 * @author Niklas Laxström, Tim Starling
 *
 * @copyright Copyright © 2010-2012, Niklas Laxström
 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
 *
 * @file
 * @since 1.20
 */

/**
 * Helper class for converting rules to reverse polish notation (RPN).
 *
 * The converter tokenises a plural rule string and runs the tokens through
 * Dijkstra's shunting yard algorithm. Operands are represented by
 * CLDRPluralRuleConverterExpression objects and operators by
 * CLDRPluralRuleConverterOperator objects; all failures are reported by
 * throwing CLDRPluralRuleError. Those three classes are defined elsewhere.
 *
 * Usage: $rpn = CLDRPluralRuleConverter::convert( $rule );
 */
class CLDRPluralRuleConverter {
    /**
     * The input string
     *
     * @var string
     */
    public $rule;

    /**
     * The current position (byte offset into $rule)
     *
     * @var int
     */
    public $pos;

    /**
     * The past-the-end position (strlen of $rule)
     *
     * @var int
     */
    public $end;

    /**
     * The operator stack
     *
     * @var array
     */
    public $operators = array();

    /**
     * The operand stack
     *
     * @var array
     */
    public $operands = array();

    /**
     * Precedence levels; a larger number binds more tightly. Note that
     * there's no need to worry about associativity for the level 4
     * operators, since they return boolean and don't accept boolean inputs.
     *
     * NOTE(review): the symbolic operators "=", "!=" and "%" have no entry
     * here although nextToken() emits them. Presumably
     * CLDRPluralRuleConverterOperator maps them onto the word operators
     * before exposing its name -- confirm against that class.
     */
    private static $precedence = array(
        'or' => 2,
        'and' => 3,
        'is' => 4,
        'is-not' => 4,
        'in' => 4,
        'not-in' => 4,
        'within' => 4,
        'not-within' => 4,
        'mod' => 5,
        ',' => 6,
        '..' => 7,
    );

    /**
     * A character list defining whitespace, for use in strspn() etc.
     */
    const WHITESPACE_CLASS = " \t\r\n";

    /**
     * Same for digits. Note that the grammar given in UTS #35 doesn't allow
     * negative numbers or decimal separators.
     */
    const NUMBER_CLASS = '0123456789';

    /**
     * A character list of symbolic operands.
     */
    const OPERAND_SYMBOLS = 'nivwft';

    /**
     * An anchored regular expression which matches a word at the current offset.
     */
    const WORD_REGEX = '/[a-zA-Z@]+/A';

    /**
     * Convert a rule to RPN. This is the only public entry point.
     *
     * @param string $rule The rule to convert
     * @return string The RPN representation of the rule
     * @throws CLDRPluralRuleError If the rule cannot be parsed
     */
    public static function convert( $rule ) {
        $parser = new self( $rule );

        return $parser->doConvert();
    }

    /**
     * Non-public constructor; instances are created through convert().
     *
     * @param string $rule
     */
    protected function __construct( $rule ) {
        $this->rule = $rule;
        $this->pos = 0;
        $this->end = strlen( $rule );
    }

    /**
     * Do the operation.
     *
     * @return string The RPN representation of the rule (e.g. "5 3 mod n is")
     * @throws CLDRPluralRuleError On any syntax or type error in the rule
     */
    protected function doConvert() {
        $expectOperator = true;

        // Iterate through all tokens, saving the operators and operands to a
        // stack per Dijkstra's shunting yard algorithm.
        /** @var CLDRPluralRuleConverterFragment $token */
        while ( false !== ( $token = $this->nextToken() ) ) {
            // In this grammar, there are only binary operators, so every valid
            // rule string will alternate between operator and operand tokens.
            // The flag is flipped before it is tested, so it describes the
            // token in hand: false for the first token, which must be an operand.
            $expectOperator = !$expectOperator;

            if ( $token instanceof CLDRPluralRuleConverterExpression ) {
                // Operand
                if ( $expectOperator ) {
                    $token->error( 'unexpected operand' );
                }
                $this->operands[] = $token;
                continue;
            }

            // Operator
            if ( !$expectOperator ) {
                $token->error( 'unexpected operator' );
            }

            // Resolve every stacked operator of equal or higher precedence
            // before pushing the new one. end() yields false on an empty stack.
            $lastOp = end( $this->operators );
            while ( $lastOp && self::$precedence[$token->name] <= self::$precedence[$lastOp->name] ) {
                $this->doOperation( $lastOp );
                array_pop( $this->operators );
                $lastOp = end( $this->operators );
            }
            $this->operators[] = $token;
        }

        // Finish off the stack
        while ( $op = array_pop( $this->operators ) ) {
            $this->doOperation( $op );
        }

        // Make sure the result is sane. The first case is possible for an empty
        // string input, the second should be unreachable.
        if ( !count( $this->operands ) ) {
            $this->error( 'condition expected' );
        } elseif ( count( $this->operands ) > 1 ) {
            $this->error( 'missing operator or too many operands' );
        }

        $value = $this->operands[0];
        if ( $value->type !== 'boolean' ) {
            $this->error( 'the result must have a boolean type' );
        }

        return $value->rpn;
    }

    /**
     * Fetch the next token from the input string.
     *
     * Tokens are tried in this order: numbers, two-character operators,
     * single-character operators, two-word operators, single-word operators,
     * single-letter operand symbols, and finally the "@integer"/"@decimal"
     * sample markers, which end tokenisation.
     *
     * @return CLDRPluralRuleConverterFragment|bool The next token, or false
     *   when the end of the rule (or the start of the samples) is reached
     * @throws CLDRPluralRuleError On an unexpected character or unknown word
     */
    protected function nextToken() {
        if ( $this->pos >= $this->end ) {
            return false;
        }

        // Whitespace
        $length = strspn( $this->rule, self::WHITESPACE_CLASS, $this->pos );
        $this->pos += $length;

        if ( $this->pos >= $this->end ) {
            return false;
        }

        // Number
        $length = strspn( $this->rule, self::NUMBER_CLASS, $this->pos );
        if ( $length !== 0 ) {
            $token = $this->newNumber( substr( $this->rule, $this->pos, $length ), $this->pos );
            $this->pos += $length;

            return $token;
        }

        // Two-character operators
        $op2 = substr( $this->rule, $this->pos, 2 );
        if ( $op2 === '..' || $op2 === '!=' ) {
            $token = $this->newOperator( $op2, $this->pos, 2 );
            $this->pos += 2;

            return $token;
        }

        // Single-character operators
        $op1 = $this->rule[$this->pos];
        if ( $op1 === ',' || $op1 === '=' || $op1 === '%' ) {
            $token = $this->newOperator( $op1, $this->pos, 1 );
            $this->pos++;

            return $token;
        }

        // Word
        if ( !preg_match( self::WORD_REGEX, $this->rule, $m, 0, $this->pos ) ) {
            // Quote the whole (possibly multi-byte) UTF-8 character rather than
            // its first byte, so the message stays valid UTF-8. If the input is
            // not valid UTF-8 the /u match fails and the raw byte is used.
            $char = preg_match( '/./suA', $this->rule, $c, 0, $this->pos )
                ? $c[0]
                : $this->rule[$this->pos];
            $this->error( 'unexpected character "' . $char . '"' );
        }
        $word1 = strtolower( $m[0] );
        $word2 = '';
        $nextTokenPos = $this->pos + strlen( $word1 );
        if ( $word1 === 'not' || $word1 === 'is' ) {
            // Look ahead one word
            $nextTokenPos += strspn( $this->rule, self::WHITESPACE_CLASS, $nextTokenPos );
            if ( $nextTokenPos < $this->end
                && preg_match( self::WORD_REGEX, $this->rule, $m, 0, $nextTokenPos )
            ) {
                $word2 = strtolower( $m[0] );
                $nextTokenPos += strlen( $word2 );
            }
        }

        // Two-word operators like "is not" take precedence over single-word operators like "is"
        if ( $word2 !== '' ) {
            $bothWords = "{$word1}-{$word2}";
            if ( isset( self::$precedence[$bothWords] ) ) {
                $token = $this->newOperator( $bothWords, $this->pos, $nextTokenPos - $this->pos );
                $this->pos = $nextTokenPos;

                return $token;
            }
        }

        // Single-word operators
        if ( isset( self::$precedence[$word1] ) ) {
            $token = $this->newOperator( $word1, $this->pos, strlen( $word1 ) );
            $this->pos += strlen( $word1 );

            return $token;
        }

        // The single-character operand symbols. The length check matters:
        // strpos() alone would also accept multi-letter substrings of the
        // symbol list (e.g. "ni" or "vw") as if they were one operand.
        if ( strlen( $word1 ) === 1 && strpos( self::OPERAND_SYMBOLS, $word1 ) !== false ) {
            $token = $this->newNumber( $word1, $this->pos );
            $this->pos++;

            return $token;
        }

        // Samples
        if ( $word1 === '@integer' || $word1 === '@decimal' ) {
            // Samples are like comments, they have no effect on rule evaluation.
            // They run from the first sample indicator to the end of the string.
            $this->pos = $this->end;

            return false;
        }

        $this->error( 'unrecognised word' );
    }

    /**
     * For the binary operator $op, pop its operands off the stack and push
     * a fragment with rpn and type members describing the result of that
     * operation.
     *
     * @param CLDRPluralRuleConverterOperator $op
     * @throws CLDRPluralRuleError If fewer than two operands are available
     */
    protected function doOperation( $op ) {
        if ( count( $this->operands ) < 2 ) {
            $op->error( 'missing operand' );
        }
        // The most recently pushed operand is the right-hand side.
        $right = array_pop( $this->operands );
        $left = array_pop( $this->operands );
        $result = $op->operate( $left, $right );
        $this->operands[] = $result;
    }

    /**
     * Create a numerical expression object
     *
     * @param string $text The token text (digits or an operand symbol)
     * @param int $pos Offset of the token within the rule
     * @return CLDRPluralRuleConverterExpression The numerical expression
     */
    protected function newNumber( $text, $pos ) {
        return new CLDRPluralRuleConverterExpression( $this, 'number', $text, $pos, strlen( $text ) );
    }

    /**
     * Create a binary operator
     *
     * @param string $type The operator text, e.g. "is-not", "mod" or ".."
     * @param int $pos Offset of the token within the rule
     * @param int $length Length of the token in the rule, in bytes
     * @return CLDRPluralRuleConverterOperator The operator
     */
    protected function newOperator( $type, $pos, $length ) {
        return new CLDRPluralRuleConverterOperator( $this, $type, $pos, $length );
    }

    /**
     * Throw an error
     *
     * @param string $message
     * @throws CLDRPluralRuleError Always
     */
    protected function error( $message ) {
        throw new CLDRPluralRuleError( $message );
    }
}