summary | refs | log | tree | commit | diff
path: root/includes/normal
diff options
context:
space:
mode:
Diffstat (limited to 'includes/normal')
-rw-r--r-- includes/normal/Makefile 2
-rw-r--r-- includes/normal/README 10
-rw-r--r-- includes/normal/RandomTest.php 8
-rw-r--r-- includes/normal/Utf8CaseGenerate.php 6
-rw-r--r-- includes/normal/Utf8Test.php 7
-rw-r--r-- includes/normal/UtfNormal.php 36
-rw-r--r-- includes/normal/UtfNormalBench.php 12
-rw-r--r-- includes/normal/UtfNormalDefines.php 4
-rw-r--r-- includes/normal/UtfNormalGenerate.php 4
-rw-r--r-- includes/normal/UtfNormalMemStress.php 10
-rw-r--r-- includes/normal/UtfNormalTest.php 16
-rw-r--r-- includes/normal/UtfNormalTest2.php 6
-rw-r--r-- includes/normal/UtfNormalUtil.php 14
13 files changed, 67 insertions, 68 deletions
diff --git a/includes/normal/Makefile b/includes/normal/Makefile
index f0c340f6..66348ee3 100644
--- a/includes/normal/Makefile
+++ b/includes/normal/Makefile
@@ -8,7 +8,7 @@
# Explicitly using Unicode 6.0
BASE=http://www.unicode.org/Public/6.0.0/ucd
-# Can override to php-cli or php5 or whatevah
+# Can override to php-cli or php5 or whatever
PHP=php
#PHP=php-cli
diff --git a/includes/normal/README b/includes/normal/README
index a17aa7da..0f718d2c 100644
--- a/includes/normal/README
+++ b/includes/normal/README
@@ -48,12 +48,12 @@ grains of salt.
There's an experimental PHP extension module which wraps the ICU library's
normalization functions. This is *MUCH* faster than doing this work in pure
-PHP code. This is in the 'normal' directory in MediaWiki's CVS extensions
-module. It is known to work with PHP 4.3.8 and 5.0.2 on Linux/x86 but hasn't
-been thoroughly tested on other configurations.
+PHP code. This is at https://git.wikimedia.org/summary/mediawiki%2Fextensions%2Fnormal.git.
+It is used by the WMF, which currently runs PHP 5.3.10 on Linux. It hasn't been
+thoroughly tested on other configurations, but may work.
If the php_normal.so module is loaded in php.ini, the normalization functions
will automatically use it. If you can't (or don't want to) load it in php.ini,
-you may be able to load it using the dl() function before include()ing or
-require()ing UtfNormal.php, and it will be picked up.
+you may be able to load it using the dl() function before the inclusion of
+UtfNormal.php, and it will be picked up.
diff --git a/includes/normal/RandomTest.php b/includes/normal/RandomTest.php
index 23471e94..06029868 100644
--- a/includes/normal/RandomTest.php
+++ b/includes/normal/RandomTest.php
@@ -26,15 +26,15 @@
* @ingroup UtfNormal
*/
-if( php_sapi_name() != 'cli' ) {
+if( PHP_SAPI != 'cli' ) {
die( "Run me from the command line please.\n" );
}
/** */
-require_once( 'UtfNormal.php' );
-require_once( '../diff/DifferenceEngine.php' );
+require_once 'UtfNormal.php';
+require_once '../diff/DifferenceEngine.php';
-dl('php_utfnormal.so' );
+dl( 'php_utfnormal.so' );
# mt_srand( 99999 );
diff --git a/includes/normal/Utf8CaseGenerate.php b/includes/normal/Utf8CaseGenerate.php
index 368d0bcd..adc3ef22 100644
--- a/includes/normal/Utf8CaseGenerate.php
+++ b/includes/normal/Utf8CaseGenerate.php
@@ -25,7 +25,7 @@
* @ingroup UtfNormal
*/
-if( php_sapi_name() != 'cli' ) {
+if( PHP_SAPI != 'cli' ) {
die( "Run me from the command line please.\n" );
}
@@ -49,7 +49,7 @@ while( false !== ($line = fgets( $in ) ) ) {
$name = $columns[1];
$simpleUpper = $columns[12];
$simpleLower = $columns[13];
-
+
$source = codepointToUtf8( hexdec( $codepoint ) );
if( $simpleUpper ) {
$wikiUpperChars[$source] = codepointToUtf8( hexdec( $simpleUpper ) );
@@ -60,7 +60,7 @@ while( false !== ($line = fgets( $in ) ) ) {
}
fclose( $in );
-$out = fopen("Utf8Case.php", "wt");
+$out = fopen( "Utf8Case.php", "wt" );
if( $out ) {
$outUpperChars = escapeArray( $wikiUpperChars );
$outLowerChars = escapeArray( $wikiLowerChars );
diff --git a/includes/normal/Utf8Test.php b/includes/normal/Utf8Test.php
index 6eae6e72..c5c1be59 100644
--- a/includes/normal/Utf8Test.php
+++ b/includes/normal/Utf8Test.php
@@ -27,6 +27,10 @@
/** */
+if ( PHP_SAPI != 'cli' ) {
+ die( "Run me from the command line please.\n" );
+}
+
require_once 'UtfNormalDefines.php';
require_once 'UtfNormalUtil.php';
require_once 'UtfNormal.php';
@@ -34,9 +38,6 @@ mb_internal_encoding( "utf-8" );
$verbose = false;
#$verbose = true;
-if( php_sapi_name() != 'cli' ) {
- die( "Run me from the command line please.\n" );
-}
$in = fopen( "UTF-8-test.txt", "rt" );
if( !$in ) {
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 08f85bd3..5a091afc 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -37,7 +37,7 @@ define( 'NORMALIZE_INTL', function_exists( 'normalizer_normalize' ) );
*
* Not as fast as I'd like, but should be usable for most purposes.
* UtfNormal::toNFC() will bail early if given ASCII text or text
- * it can quickly deterimine is already normalized.
+ * it can quickly determine is already normalized.
*
* All functions can be called static.
*
@@ -73,7 +73,7 @@ class UtfNormal {
* Fast return for pure ASCII strings; some lesser optimizations for
* strings containing only known-good characters. Not as fast as toNFC().
*
- * @param $string String: a UTF-8 string
+ * @param string $string a UTF-8 string
* @return string a clean, shiny, normalized UTF-8 string
*/
static function cleanUp( $string ) {
@@ -114,7 +114,7 @@ class UtfNormal {
* Fast return for pure ASCII strings; some lesser optimizations for
* strings containing only known-good characters.
*
- * @param $string String: a valid UTF-8 string. Input is not validated.
+ * @param string $string a valid UTF-8 string. Input is not validated.
* @return string a UTF-8 string in normal form C
*/
static function toNFC( $string ) {
@@ -132,7 +132,7 @@ class UtfNormal {
* Convert a UTF-8 string to normal form D, canonical decomposition.
* Fast return for pure ASCII strings.
*
- * @param $string String: a valid UTF-8 string. Input is not validated.
+ * @param string $string a valid UTF-8 string. Input is not validated.
* @return string a UTF-8 string in normal form D
*/
static function toNFD( $string ) {
@@ -151,7 +151,7 @@ class UtfNormal {
* This may cause irreversible information loss, use judiciously.
* Fast return for pure ASCII strings.
*
- * @param $string String: a valid UTF-8 string. Input is not validated.
+ * @param string $string a valid UTF-8 string. Input is not validated.
* @return string a UTF-8 string in normal form KC
*/
static function toNFKC( $string ) {
@@ -170,7 +170,7 @@ class UtfNormal {
* This may cause irreversible information loss, use judiciously.
* Fast return for pure ASCII strings.
*
- * @param $string String: a valid UTF-8 string. Input is not validated.
+ * @param string $string a valid UTF-8 string. Input is not validated.
* @return string a UTF-8 string in normal form KD
*/
static function toNFKD( $string ) {
@@ -190,14 +190,14 @@ class UtfNormal {
*/
static function loadData() {
if( !isset( self::$utfCombiningClass ) ) {
- require_once( __DIR__ . '/UtfNormalData.inc' );
+ require_once __DIR__ . '/UtfNormalData.inc';
}
}
/**
* Returns true if the string is _definitely_ in NFC.
* Returns false if not or uncertain.
- * @param $string String: a valid UTF-8 string. Input is not validated.
+ * @param string $string a valid UTF-8 string. Input is not validated.
* @return bool
*/
static function quickIsNFC( $string ) {
@@ -237,7 +237,7 @@ class UtfNormal {
/**
* Returns true if the string is _definitely_ in NFC.
* Returns false if not or uncertain.
- * @param $string String: a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
+ * @param string $string a UTF-8 string, altered on output to be valid UTF-8 safe for XML.
* @return bool
*/
static function quickIsNFCVerify( &$string ) {
@@ -491,7 +491,7 @@ class UtfNormal {
*/
static function NFKD( $string ) {
if( !isset( self::$utfCompatibilityDecomp ) ) {
- require_once( 'UtfNormalDataK.inc' );
+ require_once 'UtfNormalDataK.inc';
}
return self::fastCombiningSort(
self::fastDecompose( $string, self::$utfCompatibilityDecomp ) );
@@ -503,8 +503,8 @@ class UtfNormal {
* (depending on which decomposition map is passed to us).
* Input is assumed to be *valid* UTF-8. Invalid code will break.
* @private
- * @param $string String: valid UTF-8 string
- * @param $map Array: hash of expanded decomposition map
+ * @param string $string valid UTF-8 string
+ * @param array $map hash of expanded decomposition map
* @return string a UTF-8 string decomposed, not yet normalized (needs sorting)
*/
static function fastDecompose( $string, $map ) {
@@ -564,7 +564,7 @@ class UtfNormal {
* Sorts combining characters into canonical order. This is the
* final step in creating decomposed normal forms D and KD.
* @private
- * @param $string String: a valid, decomposed UTF-8 string. Input is not validated.
+ * @param string $string a valid, decomposed UTF-8 string. Input is not validated.
* @return string a UTF-8 string with combining characters sorted in canonical order
*/
static function fastCombiningSort( $string ) {
@@ -616,7 +616,7 @@ class UtfNormal {
* Produces canonically composed sequences, i.e. normal form C or KC.
*
* @private
- * @param $string String: a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
+ * @param string $string a valid UTF-8 string in sorted normal form D or KD. Input is not validated.
* @return string a UTF-8 string with canonical precomposed characters used where possible
*/
static function fastCompose( $string ) {
@@ -627,8 +627,8 @@ class UtfNormal {
$lastHangul = 0;
$startChar = '';
$combining = '';
- $x1 = ord(substr(UTF8_HANGUL_VBASE,0,1));
- $x2 = ord(substr(UTF8_HANGUL_TEND,0,1));
+ $x1 = ord(substr(UTF8_HANGUL_VBASE, 0, 1));
+ $x2 = ord(substr(UTF8_HANGUL_TEND, 0, 1));
for( $i = 0; $i < $len; $i++ ) {
$c = $string[$i];
$n = ord( $c );
@@ -762,10 +762,10 @@ class UtfNormal {
* Function to replace some characters that we don't want
* but most of the native normalize functions keep.
*
- * @param $string String The string
+ * @param string $string The string
* @return String String with the character codes replaced.
*/
- private static function replaceForNativeNormalize( $string ) {
+ private static function replaceForNativeNormalize( $string ) {
$string = preg_replace(
'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
UTF8_REPLACEMENT,
diff --git a/includes/normal/UtfNormalBench.php b/includes/normal/UtfNormalBench.php
index 944c4435..89de9290 100644
--- a/includes/normal/UtfNormalBench.php
+++ b/includes/normal/UtfNormalBench.php
@@ -19,11 +19,15 @@
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
- *
+ *
* @file
* @ingroup UtfNormal
*/
+if( PHP_SAPI != 'cli' ) {
+ die( "Run me from the command line please.\n" );
+}
+
if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
dl( 'php_utfnormal.so' );
}
@@ -34,10 +38,6 @@ require_once 'UtfNormal.php';
define( 'BENCH_CYCLES', 5 );
-if( php_sapi_name() != 'cli' ) {
- die( "Run me from the command line please.\n" );
-}
-
$testfiles = array(
'testdata/washington.txt' => 'English text',
'testdata/berlin.txt' => 'German text',
@@ -80,7 +80,7 @@ function benchmarkTest( &$u, $filename, $desc ) {
}
}
-function benchTime(){
+function benchTime() {
$st = explode( ' ', microtime() );
return (float)$st[0] + (float)$st[1];
}
diff --git a/includes/normal/UtfNormalDefines.php b/includes/normal/UtfNormalDefines.php
index 5142a414..b07e3399 100644
--- a/includes/normal/UtfNormalDefines.php
+++ b/includes/normal/UtfNormalDefines.php
@@ -2,7 +2,7 @@
/**
* Some constant definitions for the unicode normalization module.
*
- * Note: these constants must all be resolvable at compile time by HipHop,
+ * Note: these constants must all be resolvable at compile time by HipHop,
* since this file will not be executed during request startup for a compiled
* MediaWiki.
*
@@ -26,7 +26,7 @@
*/
define( 'UNICODE_HANGUL_FIRST', 0xac00 );
-define( 'UNICODE_HANGUL_LAST', 0xd7a3 );
+define( 'UNICODE_HANGUL_LAST', 0xd7a3 );
define( 'UNICODE_HANGUL_LBASE', 0x1100 );
define( 'UNICODE_HANGUL_VBASE', 0x1161 );
diff --git a/includes/normal/UtfNormalGenerate.php b/includes/normal/UtfNormalGenerate.php
index e4c1138e..f392df52 100644
--- a/includes/normal/UtfNormalGenerate.php
+++ b/includes/normal/UtfNormalGenerate.php
@@ -25,7 +25,7 @@
* @ingroup UtfNormal
*/
-if( php_sapi_name() != 'cli' ) {
+if( PHP_SAPI != 'cli' ) {
die( "Run me from the command line please.\n" );
}
@@ -177,7 +177,7 @@ if( $out ) {
*
* @file
*/
-
+
UtfNormal::\$utfCombiningClass = unserialize( '$serCombining' );
UtfNormal::\$utfCanonicalComp = unserialize( '$serComp' );
UtfNormal::\$utfCanonicalDecomp = unserialize( '$serCanon' );
diff --git a/includes/normal/UtfNormalMemStress.php b/includes/normal/UtfNormalMemStress.php
index 1277dc20..9732d762 100644
--- a/includes/normal/UtfNormalMemStress.php
+++ b/includes/normal/UtfNormalMemStress.php
@@ -26,6 +26,10 @@
* @ingroup UtfNormal
*/
+if( PHP_SAPI != 'cli' ) {
+ die( "Run me from the command line please.\n" );
+}
+
if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'] ) ) {
dl( 'php_utfnormal.so' );
}
@@ -38,10 +42,6 @@ define( 'BENCH_CYCLES', 1 );
define( 'BIGSIZE', 1024 * 1024 * 10); // 10m
ini_set('memory_limit', BIGSIZE + 120 * 1024 * 1024);
-if( php_sapi_name() != 'cli' ) {
- die( "Run me from the command line please.\n" );
-}
-
$testfiles = array(
'testdata/washington.txt' => 'English text',
'testdata/berlin.txt' => 'German text',
@@ -82,7 +82,7 @@ function benchmarkTest( &$u, $filename, $desc ) {
}
}
-function benchTime(){
+function benchTime() {
$st = explode( ' ', microtime() );
return (float)$st[0] + (float)$st[1];
}
diff --git a/includes/normal/UtfNormalTest.php b/includes/normal/UtfNormalTest.php
index 5872ec34..51183666 100644
--- a/includes/normal/UtfNormalTest.php
+++ b/includes/normal/UtfNormalTest.php
@@ -25,14 +25,16 @@
* @ingroup UtfNormal
*/
+if( PHP_SAPI != 'cli' ) {
+ die( "Run me from the command line please.\n" );
+}
+
$verbose = true;
#define( 'PRETTY_UTF8', true );
if( defined( 'PRETTY_UTF8' ) ) {
function pretty( $string ) {
- return preg_replace( '/([\x00-\xff])/e',
- 'sprintf("%02X", ord("$1"))',
- $string );
+ return strtoupper( bin2hex( $string ) );
}
} else {
/**
@@ -40,9 +42,7 @@ if( defined( 'PRETTY_UTF8' ) ) {
* @return string
*/
function pretty( $string ) {
- return trim( preg_replace( '/(.)/use',
- 'sprintf("%04X ", utf8ToCodepoint("$1"))',
- $string ) );
+ return strtoupper( utf8ToHexSequence( $string ) );
}
}
@@ -54,10 +54,6 @@ require_once 'UtfNormalDefines.php';
require_once 'UtfNormalUtil.php';
require_once 'UtfNormal.php';
-if( php_sapi_name() != 'cli' ) {
- die( "Run me from the command line please.\n" );
-}
-
$in = fopen("NormalizationTest.txt", "rt");
if( !$in ) {
print "Couldn't open NormalizationTest.txt -- can't run tests.\n";
diff --git a/includes/normal/UtfNormalTest2.php b/includes/normal/UtfNormalTest2.php
index 691bfaa7..750c0099 100644
--- a/includes/normal/UtfNormalTest2.php
+++ b/includes/normal/UtfNormalTest2.php
@@ -1,4 +1,4 @@
-#!/usr/bin/php
+#!/usr/bin/env php
<?php
/**
* Other tests for the unicode normalization module.
@@ -22,7 +22,7 @@
* @ingroup UtfNormal
*/
-if( php_sapi_name() != 'cli' ) {
+if( PHP_SAPI != 'cli' ) {
die( "Run me from the command line please.\n" );
}
@@ -65,7 +65,7 @@ $f = fopen($file, "r");
later and slow down the runtime.
*/
-require_once("./UtfNormal.php");
+require_once './UtfNormal.php';
function normalize_form_c($c) { return UtfNormal::toNFC($c); }
function normalize_form_d($c) { return UtfNormal::toNFD($c); }
function normalize_form_kc($c) { return UtfNormal::toNFKC($c); }
diff --git a/includes/normal/UtfNormalUtil.php b/includes/normal/UtfNormalUtil.php
index bfad7095..e8fec936 100644
--- a/includes/normal/UtfNormalUtil.php
+++ b/includes/normal/UtfNormalUtil.php
@@ -71,14 +71,16 @@ function hexSequenceToUtf8( $sequence ) {
* Take a UTF-8 string and return a space-separated series of hex
* numbers representing Unicode code points. For debugging.
*
- * @param $str String: UTF-8 string.
+ * @param string $str UTF-8 string.
* @return string
* @private
*/
function utf8ToHexSequence( $str ) {
- return rtrim( preg_replace( '/(.)/uSe',
- 'sprintf("%04x ", utf8ToCodepoint("$1"))',
- $str ) );
+ $buf = '';
+ foreach ( preg_split( '//u', $str, -1, PREG_SPLIT_NO_EMPTY ) as $cp ) {
+ $buf .= sprintf( '%04x ', utf8ToCodepoint( $cp ) );
+ }
+ return rtrim( $buf );
}
/**
@@ -114,7 +116,7 @@ function utf8ToCodepoint( $char ) {
$z >>= $length;
# Add in the free bits from subsequent bytes
- for ( $i=1; $i<$length; $i++ ) {
+ for ( $i=1; $i < $length; $i++ ) {
$z <<= 6;
$z |= ord( $char[$i] ) & 0x3f;
}
@@ -125,7 +127,7 @@ function utf8ToCodepoint( $char ) {
/**
* Escape a string for inclusion in a PHP single-quoted string literal.
*
- * @param $string String: string to be escaped.
+ * @param string $string string to be escaped.
* @return String: escaped string.
* @public
*/