summaryrefslogtreecommitdiff
path: root/languages/utils/CLDRPluralRuleConverter.php
blob: 2eabcab14b4ec41bad93ece4a0ea1d173865c227 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
<?php
/**
 * @author Niklas Laxström, Tim Starling
 *
 * @copyright Copyright © 2010-2012, Niklas Laxström
 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
 *
 * @file
 * @since 1.20
 */

/**
 * Helper class for converting rules to reverse polish notation (RPN).
 */
class CLDRPluralRuleConverter {
	/**
	 * The input string
	 *
	 * @var string
	 */
	public $rule;

	/**
	 * The current position
	 *
	 * @var int
	 */
	public $pos;

	/**
	 * The past-the-end position
	 *
	 * @var int
	 */
	public $end;

	/**
	 * The operator stack
	 *
	 * @var array
	 */
	public $operators = array();

	/**
	 * The operand stack
	 *
	 * @var array
	 */
	public $operands = array();

	/**
	 * Precedence levels. Note that there's no need to worry about associativity
	 * for the level 4 operators, since they return boolean and don't accept
	 * boolean inputs.
	 */
	private static $precedence = array(
		'or' => 2,
		'and' => 3,
		'is' => 4,
		'is-not' => 4,
		'in' => 4,
		'not-in' => 4,
		'within' => 4,
		'not-within' => 4,
		'mod' => 5,
		',' => 6,
		'..' => 7,
	);

	/**
	 * A character list defining whitespace, for use in strspn() etc.
	 */
	const WHITESPACE_CLASS = " \t\r\n";

	/**
	 * Same for digits. Note that the grammar given in UTS #35 doesn't allow
	 * negative numbers or decimal separators.
	 */
	const NUMBER_CLASS = '0123456789';

	/**
	 * A character list of symbolic operands.
	 */
	const OPERAND_SYMBOLS = 'nivwft';

	/**
	 * An anchored regular expression which matches a word at the current offset.
	 */
	const WORD_REGEX = '/[a-zA-Z@]+/A';

	/**
	 * Convert a rule to RPN. This is the only public entry point.
	 *
	 * @param string $rule The rule to convert
	 * @return string The RPN representation of the rule
	 */
	public static function convert( $rule ) {
		$parser = new self( $rule );

		return $parser->doConvert();
	}

	/**
	 * Private constructor.
	 * @param string $rule
	 */
	protected function __construct( $rule ) {
		$this->rule = $rule;
		$this->pos = 0;
		$this->end = strlen( $rule );
	}

	/**
	 * Do the operation.
	 *
	 * @return string The RPN representation of the rule (e.g. "5 3 mod n is")
	 */
	protected function doConvert() {
		$expectOperator = true;

		// Iterate through all tokens, saving the operators and operands to a
		// stack per Dijkstra's shunting yard algorithm.
		/** @var CLDRPluralRuleConverterOperator $token */
		while ( false !== ( $token = $this->nextToken() ) ) {
			// In this grammar, there are only binary operators, so every valid
			// rule string will alternate between operator and operand tokens.
			$expectOperator = !$expectOperator;

			if ( $token instanceof CLDRPluralRuleConverterExpression ) {
				// Operand
				if ( $expectOperator ) {
					$token->error( 'unexpected operand' );
				}
				$this->operands[] = $token;
				continue;
			} else {
				// Operator
				if ( !$expectOperator ) {
					$token->error( 'unexpected operator' );
				}
				// Resolve higher precedence levels
				$lastOp = end( $this->operators );
				while ( $lastOp && self::$precedence[$token->name] <= self::$precedence[$lastOp->name] ) {
					$this->doOperation( $lastOp, $this->operands );
					array_pop( $this->operators );
					$lastOp = end( $this->operators );
				}
				$this->operators[] = $token;
			}
		}

		// Finish off the stack
		while ( $op = array_pop( $this->operators ) ) {
			$this->doOperation( $op, $this->operands );
		}

		// Make sure the result is sane. The first case is possible for an empty
		// string input, the second should be unreachable.
		if ( !count( $this->operands ) ) {
			$this->error( 'condition expected' );
		} elseif ( count( $this->operands ) > 1 ) {
			$this->error( 'missing operator or too many operands' );
		}

		$value = $this->operands[0];
		if ( $value->type !== 'boolean' ) {
			$this->error( 'the result must have a boolean type' );
		}

		return $this->operands[0]->rpn;
	}

	/**
	 * Fetch the next token from the input string.
	 *
	 * @return CLDRPluralRuleConverterFragment The next token
	 */
	protected function nextToken() {
		if ( $this->pos >= $this->end ) {
			return false;
		}

		// Whitespace
		$length = strspn( $this->rule, self::WHITESPACE_CLASS, $this->pos );
		$this->pos += $length;

		if ( $this->pos >= $this->end ) {
			return false;
		}

		// Number
		$length = strspn( $this->rule, self::NUMBER_CLASS, $this->pos );
		if ( $length !== 0 ) {
			$token = $this->newNumber( substr( $this->rule, $this->pos, $length ), $this->pos );
			$this->pos += $length;

			return $token;
		}

		// Two-character operators
		$op2 = substr( $this->rule, $this->pos, 2 );
		if ( $op2 === '..' || $op2 === '!=' ) {
			$token = $this->newOperator( $op2, $this->pos, 2 );
			$this->pos += 2;

			return $token;
		}

		// Single-character operators
		$op1 = $this->rule[$this->pos];
		if ( $op1 === ',' || $op1 === '=' || $op1 === '%' ) {
			$token = $this->newOperator( $op1, $this->pos, 1 );
			$this->pos++;

			return $token;
		}

		// Word
		if ( !preg_match( self::WORD_REGEX, $this->rule, $m, 0, $this->pos ) ) {
			$this->error( 'unexpected character "' . $this->rule[$this->pos] . '"' );
		}
		$word1 = strtolower( $m[0] );
		$word2 = '';
		$nextTokenPos = $this->pos + strlen( $word1 );
		if ( $word1 === 'not' || $word1 === 'is' ) {
			// Look ahead one word
			$nextTokenPos += strspn( $this->rule, self::WHITESPACE_CLASS, $nextTokenPos );
			if ( $nextTokenPos < $this->end
				&& preg_match( self::WORD_REGEX, $this->rule, $m, 0, $nextTokenPos )
			) {
				$word2 = strtolower( $m[0] );
				$nextTokenPos += strlen( $word2 );
			}
		}

		// Two-word operators like "is not" take precedence over single-word operators like "is"
		if ( $word2 !== '' ) {
			$bothWords = "{$word1}-{$word2}";
			if ( isset( self::$precedence[$bothWords] ) ) {
				$token = $this->newOperator( $bothWords, $this->pos, $nextTokenPos - $this->pos );
				$this->pos = $nextTokenPos;

				return $token;
			}
		}

		// Single-word operators
		if ( isset( self::$precedence[$word1] ) ) {
			$token = $this->newOperator( $word1, $this->pos, strlen( $word1 ) );
			$this->pos += strlen( $word1 );

			return $token;
		}

		// The single-character operand symbols
		if ( strpos( self::OPERAND_SYMBOLS, $word1 ) !== false ) {
			$token = $this->newNumber( $word1, $this->pos );
			$this->pos++;

			return $token;
		}

		// Samples
		if ( $word1 === '@integer' || $word1 === '@decimal' ) {
			// Samples are like comments, they have no effect on rule evaluation.
			// They run from the first sample indicator to the end of the string.
			$this->pos = $this->end;

			return false;
		}

		$this->error( 'unrecognised word' );
	}

	/**
	 * For the binary operator $op, pop its operands off the stack and push
	 * a fragment with rpn and type members describing the result of that
	 * operation.
	 *
	 * @param CLDRPluralRuleConverterOperator $op
	 */
	protected function doOperation( $op ) {
		if ( count( $this->operands ) < 2 ) {
			$op->error( 'missing operand' );
		}
		$right = array_pop( $this->operands );
		$left = array_pop( $this->operands );
		$result = $op->operate( $left, $right );
		$this->operands[] = $result;
	}

	/**
	 * Create a numerical expression object
	 *
	 * @param string $text
	 * @param int $pos
	 * @return CLDRPluralRuleConverterExpression The numerical expression
	 */
	protected function newNumber( $text, $pos ) {
		return new CLDRPluralRuleConverterExpression( $this, 'number', $text, $pos, strlen( $text ) );
	}

	/**
	 * Create a binary operator
	 *
	 * @param string $type
	 * @param int $pos
	 * @param int $length
	 * @return CLDRPluralRuleConverterOperator The operator
	 */
	protected function newOperator( $type, $pos, $length ) {
		return new CLDRPluralRuleConverterOperator( $this, $type, $pos, $length );
	}

	/**
	 * Throw an error
	 * @param string $message
	 */
	protected function error( $message ) {
		throw new CLDRPluralRuleError( $message );
	}
}