13 files changed, 67 insertions, 68 deletions
diff --git a/includes/normal/Makefile b/includes/normal/Makefile
index f0c340f6..66348ee3 100644
--- a/includes/normal/Makefile
+++ b/includes/normal/Makefile
@@ -8,7 +8,7 @@
 # Explicitly using Unicode 6.0
 BASE=http://www.unicode.org/Public/6.0.0/ucd
 
-# Can override to php-cli or php5 or whatevah
+# Can override to php-cli or php5 or whatever
 PHP=php
 #PHP=php-cli
 
diff --git a/includes/normal/README b/includes/normal/README
index a17aa7da..0f718d2c 100644
--- a/includes/normal/README
+++ b/includes/normal/README
@@ -48,12 +48,12 @@ grains of salt.
 
 There's an experimental PHP extension module which wraps the ICU library's
 normalization functions. This is *MUCH* faster than doing this work in pure
-PHP code. This is in the 'normal' directory in MediaWiki's CVS extensions
-module. It is known to work with PHP 4.3.8 and 5.0.2 on Linux/x86 but hasn't
-been thoroughly tested on other configurations.
+PHP code. The module's source is at https://git.wikimedia.org/summary/mediawiki%2Fextensions%2Fnormal.git.
+It is used by the WMF, which currently runs PHP 5.3.10 on Linux. It hasn't been
+thoroughly tested on other configurations, but may work.
 
 If the php_normal.so module is loaded in php.ini, the normalization functions
 will automatically use it. If you can't (or don't want to) load it in php.ini,
-you may be able to load it using the dl() function before include()ing or
-require()ing UtfNormal.php, and it will be picked up.
+you may be able to load it using the dl() function (disabled in most SAPIs as
+of PHP 5.3) before including UtfNormal.php, and it will be picked up.
 
diff --git a/includes/normal/RandomTest.php b/includes/normal/RandomTest.php
index 23471e94..06029868 100644
--- a/includes/normal/RandomTest.php
+++ b/includes/normal/RandomTest.php
@@ -26,15 +26,15 @@
  * @ingroup UtfNormal
  */
 
-if( php_sapi_name() != 'cli' ) {
+if( PHP_SAPI != 'cli' ) {
 	die( "Run me from the command line please.\n" );
 }
 
 /** */
-require_once( 'UtfNormal.php' );
-require_once( '../diff/DifferenceEngine.php' );
+require_once 'UtfNormal.php';
+require_once '../diff/DifferenceEngine.php';
 
-dl('php_utfnormal.so' );
+dl( 'php_utfnormal.so' );
 
 # mt_srand( 99999 );
 
diff --git a/includes/normal/Utf8CaseGenerate.php b/includes/normal/Utf8CaseGenerate.php
index 368d0bcd..adc3ef22 100644
--- a/includes/normal/Utf8CaseGenerate.php
+++ b/includes/normal/Utf8CaseGenerate.php
@@ -25,7 +25,7 @@
  * @ingroup UtfNormal
  */
 
-if( php_sapi_name() != 'cli' ) {
+if( PHP_SAPI != 'cli' ) {
 	die( "Run me from the command line please.\n" );
 }
 
@@ -49,7 +49,7 @@ while( false !== ($line = fgets( $in ) ) ) {
 	$name = $columns[1];
 	$simpleUpper = $columns[12];
 	$simpleLower = $columns[13];
-	
+
 	$source = codepointToUtf8( hexdec( $codepoint ) );
 	if( $simpleUpper ) {
 		$wikiUpperChars[$source] = codepointToUtf8( hexdec( $simpleUpper ) );
@@ -60,7 +60,7 @@ while( false !== ($line = fgets( $in ) ) ) {
 }
 fclose( $in );
 
-$out = fopen("Utf8Case.php", "wt");
+$out = fopen( "Utf8Case.php", "wt" );
 if( $out ) {
 	$outUpperChars = escapeArray( $wikiUpperChars );
 	$outLowerChars = escapeArray( $wikiLowerChars );
diff --git a/includes/normal/Utf8Test.php b/includes/normal/Utf8Test.php
index 6eae6e72..c5c1be59 100644
--- a/includes/normal/Utf8Test.php
+++ b/includes/normal/Utf8Test.php
@@ -27,6 +27,10 @@
 
 /** */
 
+if ( PHP_SAPI != 'cli' ) {
+	die( "Run me from the command line please.\n" );
+}
+
 require_once 'UtfNormalDefines.php';
 require_once 'UtfNormalUtil.php';
 require_once 'UtfNormal.php';
@@ -34,9 +38,6 @@ mb_internal_encoding( "utf-8" );
 
 $verbose = false;
 #$verbose = true;
-if( php_sapi_name() != 'cli' ) {
-	die( "Run me from the command line please.\n" );
-}
 
 $in = fopen( "UTF-8-test.txt", "rt" );
 if( !$in ) {
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 08f85bd3..5a091afc 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -37,7 +37,7 @@ define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
  *
  * Not as fast as I'd like, but should be usable for most purposes.
  * UtfNormal::toNFC() will bail early if given ASCII text or text
- * it can quickly deterimine is already normalized.
+ * it can quickly determine is already normalized.
  *
  * All functions can be called static.
  *
@@ -73,7 +73,7 @@ class UtfNormal {
 	 * Fast return for pure ASCII strings; some lesser optimizations for
 	 * strings containing only known-good characters. Not as fast as toNFC().
 	 *
-	 * @param $string String: a UTF-8 string
+	 * @param string $string a UTF-8 string
 	 * @return string a clean, shiny, normalized UTF-8 string
 	 */
 	static function cleanUp( $string ) {
@@ -114,7 +114,7 @@ class UtfNormal {
 	 * Fast return for pure ASCII strings; some lesser optimizations for
 	 * strings containing only known-good characters.
 	 *
-	 * @param $string String: a valid UTF-8 string. Input is not validated.
+	 * @param string $string a valid UTF-8 string. Input is not validated.
 	 * @return string a UTF-8 string in normal form C
 	 */
 	static function toNFC( $string ) {
@@ -132,7 +132,7 @@ class UtfNormal {
 	 * Convert a UTF-8 string to normal form D, canonical decomposition.
 	 * Fast return for pure ASCII strings.
 	 *
-	 * @param $string String: a valid UTF-8 string. Input is not validated.
+	 * @param string $string a valid UTF-8 string. Input is not validated.
 	 * @return string a UTF-8 string in normal form D
 	 */
 	static function toNFD( $string ) {
@@ -151,7 +151,7 @@ class UtfNormal {
 	 * This may cause irreversible information loss, use judiciously.
 	 * Fast return for pure ASCII strings.
 	 *
-	 * @param $string String: a valid UTF-8 string. Input is not validated.
+	 * @param string $string a valid UTF-8 string. Input is not validated.
 	 * @return string a UTF-8 string in normal form KC
 	 */
 	static function toNFKC( $string ) {
@@ -170,7 +170,7 @@ class UtfNormal {
 	 * This may cause irreversible information loss, use judiciously.
 	 * Fast return for pure ASCII strings.
 	 *
-	 * @param $string String: a valid UTF-8 string. Input is not validated.
+	 * @param string $string a valid UTF-8 string. Input is not validated.
 	 * @return string a UTF-8 string in normal form KD
 	 */
 	static function toNFKD( $string ) {
@@ -190,14 +190,14 @@ class UtfNormal {
 	 */
 	static function loadData() {
 		if( !isset( self::$utfCombiningClass ) ) {
-			require_once( __DIR__ . '/UtfNormalData.inc' );
+			require_once __DIR__ . '/UtfNormalData.inc';
 		}
 	}
 
 	/**
 	 * Returns true if the string is _definitely_ in NFC.
 	 * Returns false if not or uncertain.
-	 * @param $string String: a valid UTF-8 string. Input is not validated.
+	 * @param string $string a valid UTF-8 string. Input is not validated.
 	 * @return bool
 	 */
 	static function quickIsNFC( $string ) {
@@ -237,7 +237,7 @@ class UtfNormal {
 	/**
 	 * Returns true if the string is _definitely_ in NFC.
 	 * Returns false if not or uncertain.
-	 * @param $string String: a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
+	 * @param string $string a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
 	 * @return bool
 	 */
 	static function quickIsNFCVerify( &$string ) {
@@ -491,7 +491,7 @@ class UtfNormal {
 	 */
 	static function NFKD( $string ) {
 		if( !isset( self::$utfCompatibilityDecomp ) ) {
-			require_once( 'UtfNormalDataK.inc' );
+			require_once 'UtfNormalDataK.inc';
 		}
 		return self::fastCombiningSort(
 			self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
@@ -503,8 +503,8 @@ class UtfNormal {
 	 * (depending on which decomposition map is passed to us).
 	 * Input is assumed to be *valid* UTF-8. Invalid code will break.
 	 * @private
-	 * @param $string String: valid UTF-8 string
-	 * @param $map Array: hash of expanded decomposition map
+	 * @param string $string valid UTF-8 string
+	 * @param array $map hash of expanded decomposition map
 	 * @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
 	 */
 	static function fastDecompose( $string, $map ) {
@@ -564,7 +564,7 @@ class UtfNormal {
 	 * Sorts combining characters into canonical order. This is the
 	 * final step in creating decomposed normal forms D and KD.
 	 * @private
-	 * @param $string String: a valid, decomposed UTF-8 string. Input is not validated.
+	 * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
 	 * @return string a UTF-8 string with combining characters sorted in canonical order
 	 */
 	static function fastCombiningSort( $string ) {
@@ -616,7 +616,7 @@ class UtfNormal {
 	 * Produces canonically composed sequences, i.e. normal form C or KC.
 	 *
 	 * @private
-	 * @param $string String: a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
+	 * @param string $string a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
 	 * @return string a UTF-8 string with canonical precomposed characters used where possible
 	 */
 	static function fastCompose( $string ) {
@@ -627,8 +627,8 @@ class UtfNormal {
 		$lastHangul = 0;
 		$startChar = '';
 		$combining = '';
-		$x1 = ord(substr(UTF8_HANGUL_VBASE,0,1));
-		$x2 = ord(substr(UTF8_HANGUL_TEND,0,1));
+		$x1 = ord(substr(UTF8_HANGUL_VBASE, 0, 1));
+		$x2 = ord(substr(UTF8_HANGUL_TEND, 0, 1));
 		for( $i = 0; $i < $len; $i++ ) {
 			$c = $string[$i];
 			$n = ord( $c );
@@ -762,10 +762,10 @@ class UtfNormal {
 	 * Function to replace some characters that we don't want
 	 * but most of the native normalize functions keep.
 	 *
-	 * @param $string String The string
+	 * @param string $string The string
 	 * @return String String with the character codes replaced.
 	 */
-	private static function replaceForNativeNormalize( $string ) { 
+	private static function replaceForNativeNormalize( $string ) {
 		$string = preg_replace(
 			'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
 			UTF8_REPLACEMENT,
diff --git a/includes/normal/UtfNormalBench.php b/includes/normal/UtfNormalBench.php
index 944c4435..89de9290 100644
--- a/includes/normal/UtfNormalBench.php
+++ b/includes/normal/UtfNormalBench.php
@@ -19,11 +19,15 @@
  * with this program; if not, write to the Free Software Foundation, Inc.,
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  * http://www.gnu.org/copyleft/gpl.html
- * 
+ *
  * @file
  * @ingroup UtfNormal
  */
 
+if( PHP_SAPI != 'cli' ) {
+	die( "Run me from the command line please.\n" );
+}
+
 if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
 	dl( 'php_utfnormal.so' );
 }
@@ -34,10 +38,6 @@ require_once 'UtfNormal.php';
 
 define( 'BENCH_CYCLES', 5 );
 
-if( php_sapi_name() != 'cli' ) {
-	die( "Run me from the command line please.\n" );
-}
-
 $testfiles = array(
 	'testdata/washington.txt' => 'English text',
 	'testdata/berlin.txt' => 'German text',
@@ -80,7 +80,7 @@ function benchmarkTest( &$u, $filename, $desc ) {
 	}
 }
 
-function benchTime(){
+function benchTime() {
 	$st = explode( ' ', microtime() );
 	return (float)$st[0] + (float)$st[1];
 }
diff --git a/includes/normal/UtfNormalDefines.php b/includes/normal/UtfNormalDefines.php
index 5142a414..b07e3399 100644
--- a/includes/normal/UtfNormalDefines.php
+++ b/includes/normal/UtfNormalDefines.php
@@ -2,7 +2,7 @@
 /**
  * Some constant definitions for the unicode normalization module.
  *
- * Note: these constants must all be resolvable at compile time by HipHop, 
+ * Note: these constants must all be resolvable at compile time by HipHop,
  * since this file will not be executed during request startup for a compiled
  * MediaWiki.
  *
@@ -26,7 +26,7 @@
  */
 
 define( 'UNICODE_HANGUL_FIRST', 0xac00 );
-define( 'UNICODE_HANGUL_LAST',  0xd7a3 );
+define( 'UNICODE_HANGUL_LAST', 0xd7a3 );
 
 define( 'UNICODE_HANGUL_LBASE', 0x1100 );
 define( 'UNICODE_HANGUL_VBASE', 0x1161 );
diff --git a/includes/normal/UtfNormalGenerate.php b/includes/normal/UtfNormalGenerate.php
index e4c1138e..f392df52 100644
--- a/includes/normal/UtfNormalGenerate.php
+++ b/includes/normal/UtfNormalGenerate.php
@@ -25,7 +25,7 @@
  * @ingroup UtfNormal
  */
 
-if( php_sapi_name() != 'cli' ) {
+if( PHP_SAPI != 'cli' ) {
 	die( "Run me from the command line please.\n" );
 }
 
@@ -177,7 +177,7 @@ if( $out ) {
  *
  * @file
  */
- 
+
 UtfNormal::\$utfCombiningClass = unserialize( '$serCombining' );
 UtfNormal::\$utfCanonicalComp = unserialize( '$serComp' );
 UtfNormal::\$utfCanonicalDecomp = unserialize( '$serCanon' );
diff --git a/includes/normal/UtfNormalMemStress.php b/includes/normal/UtfNormalMemStress.php
index 1277dc20..9732d762 100644
--- a/includes/normal/UtfNormalMemStress.php
+++ b/includes/normal/UtfNormalMemStress.php
@@ -26,6 +26,10 @@
  * @ingroup UtfNormal
  */
 
+if( PHP_SAPI != 'cli' ) {
+	die( "Run me from the command line please.\n" );
+}
+
 if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
 	dl( 'php_utfnormal.so' );
 }
@@ -38,10 +42,6 @@ define( 'BENCH_CYCLES', 1 );
 define( 'BIGSIZE', 1024 * 1024 * 10); // 10m
 ini_set('memory_limit', BIGSIZE + 120 * 1024 * 1024);
 
-if( php_sapi_name() != 'cli' ) {
-	die( "Run me from the command line please.\n" );
-}
-
 $testfiles = array(
 	'testdata/washington.txt' => 'English text',
 	'testdata/berlin.txt' => 'German text',
@@ -82,7 +82,7 @@ function benchmarkTest( &$u, $filename, $desc ) {
 	}
 }
 
-function benchTime(){
+function benchTime() {
 	$st = explode( ' ', microtime() );
 	return (float)$st[0] + (float)$st[1];
 }
diff --git a/includes/normal/UtfNormalTest.php b/includes/normal/UtfNormalTest.php
index 5872ec34..51183666 100644
--- a/includes/normal/UtfNormalTest.php
+++ b/includes/normal/UtfNormalTest.php
@@ -25,14 +25,16 @@
  * @ingroup UtfNormal
  */
 
+if( PHP_SAPI != 'cli' ) {
+	die( "Run me from the command line please.\n" );
+}
+
 $verbose = true;
 #define( 'PRETTY_UTF8', true );
 
 if( defined( 'PRETTY_UTF8' ) ) {
 	function pretty( $string ) {
-		return preg_replace( '/([\x00-\xff])/e',
-			'sprintf("%02X", ord("$1"))',
-			$string );
+		return strtoupper( bin2hex( $string ) );
 	}
 } else {
 	/**
@@ -40,9 +42,7 @@ if( defined( 'PRETTY_UTF8' ) ) {
 	 * @return string
 	 */
 	function pretty( $string ) {
-		return trim( preg_replace( '/(.)/use',
-			'sprintf("%04X ", utf8ToCodepoint("$1"))',
-			$string ) );
+		return strtoupper( utf8ToHexSequence( $string ) );
 	}
 }
 
@@ -54,10 +54,6 @@ require_once 'UtfNormalDefines.php';
 require_once 'UtfNormalUtil.php';
 require_once 'UtfNormal.php';
 
-if( php_sapi_name() != 'cli' ) {
-	die( "Run me from the command line please.\n" );
-}
-
 $in = fopen("NormalizationTest.txt", "rt");
 if( !$in ) {
 	print "Couldn't open NormalizationTest.txt -- can't run tests.\n";
diff --git a/includes/normal/UtfNormalTest2.php b/includes/normal/UtfNormalTest2.php
index 691bfaa7..750c0099 100644
--- a/includes/normal/UtfNormalTest2.php
+++ b/includes/normal/UtfNormalTest2.php
@@ -1,4 +1,4 @@
-#!/usr/bin/php
+#!/usr/bin/env php
 <?php
 /**
  * Other tests for the unicode normalization module.
@@ -22,7 +22,7 @@
  * @ingroup UtfNormal
  */
 
-if( php_sapi_name() != 'cli' ) {
+if( PHP_SAPI != 'cli' ) {
 	die( "Run me from the command line please.\n" );
 }
 
@@ -65,7 +65,7 @@ $f = fopen($file, "r");
      later and slow down the runtime.
  */
 
-require_once("./UtfNormal.php");
+require_once './UtfNormal.php';
 function normalize_form_c($c)      { return UtfNormal::toNFC($c);  }
 function normalize_form_d($c)      { return UtfNormal::toNFD($c);  }
 function normalize_form_kc($c)     { return UtfNormal::toNFKC($c); }
diff --git a/includes/normal/UtfNormalUtil.php b/includes/normal/UtfNormalUtil.php
index bfad7095..e8fec936 100644
--- a/includes/normal/UtfNormalUtil.php
+++ b/includes/normal/UtfNormalUtil.php
@@ -71,14 +71,16 @@ function hexSequenceToUtf8( $sequence ) {
  * Take a UTF-8 string and return a space-separated series of hex
  * numbers representing Unicode code points. For debugging.
  *
- * @param $str String: UTF-8 string.
+ * @param string $str UTF-8 string.
  * @return string
  * @private
  */
 function utf8ToHexSequence( $str ) {
-	return rtrim( preg_replace( '/(.)/uSe',
-	                            'sprintf("%04x ", utf8ToCodepoint("$1"))',
-	                            $str ) );
+	$buf = '';
+	foreach ( preg_split( '//u', $str, -1, PREG_SPLIT_NO_EMPTY ) as $cp ) {
+		$buf .= sprintf( '%04x ', utf8ToCodepoint( $cp ) );
+	}
+	return rtrim( $buf );
 }
 
 /**
@@ -114,7 +116,7 @@ function utf8ToCodepoint( $char ) {
 	$z >>= $length;
 
 	# Add in the free bits from subsequent bytes
-	for ( $i=1; $i<$length; $i++ ) {
+	for ( $i=1; $i < $length; $i++ ) {
 		$z <<= 6;
 		$z |= ord( $char[$i] ) & 0x3f;
 	}
@@ -125,7 +127,7 @@ function utf8ToCodepoint( $char ) {
 /**
  * Escape a string for inclusion in a PHP single-quoted string literal.
  *
- * @param $string String: string to be escaped.
+ * @param string $string string to be escaped.
  * @return String: escaped string.
  * @public
  */