summaryrefslogtreecommitdiff
path: root/includes/libs/IPSet.php
blob: c1c841e634aac6d12597636ee07b7c37415645a0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
<?php
/**
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @author Brandon Black <blblack@gmail.com>
 */

/**
 * Matches IP addresses against a set of CIDR specifications
 *
 * Usage:
 *   // At startup, calculate the optimized data structure for the set:
 *   $ipset = new IPSet( $wgSquidServersNoPurge );
 *   // runtime check against cached set (returns bool):
 *   $allowme = $ipset->match( $ip );
 *
 * In rough benchmarking, this takes about 80% more time than
 * in_array() checks on a short (a couple hundred at most) array
 * of addresses.  It's fast either way at those levels, though,
 * and IPSet would scale better than in_array if the array were
 * much larger.
 *
 * For mixed-family CIDR sets, however, this code gives well over
 * 100x speedup vs iterating IP::isInRange() over an array
 * of CIDR specs.
 *
 * The basic implementation is two separate binary trees
 * (IPv4 and IPv6) as nested php arrays with keys named 0 and 1.
 * The values false and true are terminal match-fail and match-success,
 * otherwise the value is a deeper node in the tree.
 *
 * A simple depth-compression scheme is also implemented: whole-byte
 * tree compression at whole-byte boundaries only, where no branching
 * occurs during that whole byte of depth.  A compressed node has
 * keys 'comp' (the byte to compare) and 'next' (the next node to
 * recurse into if 'comp' matched successfully).
 *
 * For example, given these inputs:
 * 25.0.0.0/9
 * 25.192.0.0/10
 *
 * The v4 tree would look like:
 * root4 => array(
 *     'comp' => 25,
 *     'next' => array(
 *         0 => true,
 *         1 => array(
 *             0 => false,
 *             1 => true,
 *         ),
 *     ),
 * );
 *
 * (multi-byte compression nodes were attempted as well, but were
 * a net loss in my test scenarios due to additional match complexity)
 *
 * @since 1.24
 */
class IPSet {
	/** @var array $root4: the root of the IPv4 matching tree */
	private $root4 = array( false, false );

	/** @var array $root6: the root of the IPv6 matching tree */
	private $root6 = array( false, false );

	/**
	 * __construct() instantiate the object from an array of CIDR specs
	 *
	 * @param array $cfg array of IPv[46] CIDR specs as strings
	 * @return IPSet new IPSet object
	 *
	 * Invalid input network/mask values in $cfg will result in issuing
	 * E_WARNING and/or E_USER_WARNING and the bad values being ignored.
	 */
	public function __construct( array $cfg ) {
		foreach ( $cfg as $cidr ) {
			$this->addCidr( $cidr );
		}

		self::recOptimize( $this->root4 );
		self::recCompress( $this->root4, 0, 24 );
		self::recOptimize( $this->root6 );
		self::recCompress( $this->root6, 0, 120 );
	}

	/**
	 * Add a single CIDR spec to the internal matching trees
	 *
	 * @param string $cidr string CIDR spec, IPv[46], optional /mask (def all-1's)
	 */
	private function addCidr( $cidr ) {
		// v4 or v6 check
		if ( strpos( $cidr, ':' ) === false ) {
			$node =& $this->root4;
			$defMask = '32';
		} else {
			$node =& $this->root6;
			$defMask = '128';
		}

		// Default to all-1's mask if no netmask in the input
		if ( strpos( $cidr, '/' ) === false ) {
			$net = $cidr;
			$mask = $defMask;
		} else {
			list( $net, $mask ) = explode( '/', $cidr, 2 );
			if ( !ctype_digit( $mask ) || intval( $mask ) > $defMask ) {
				trigger_error( "IPSet: Bad mask '$mask' from '$cidr', ignored", E_USER_WARNING );
				return;
			}
		}
		$mask = intval( $mask ); // explicit integer convert, checked above

		// convert $net to an array of integer bytes, length 4 or 16:
		$raw = inet_pton( $net );
		if ( $raw === false ) {
			return; // inet_pton() sends an E_WARNING for us
		}
		$rawOrd = array_map( 'ord', str_split( $raw ) );

		// special-case: zero mask overwrites the whole tree with a pair of terminal successes
		if ( $mask == 0 ) {
			$node = array( true, true );
			return;
		}

		// iterate the bits of the address while walking the tree structure for inserts
		$curBit = 0;
		while ( 1 ) {
			$maskShift = 7 - ( $curBit & 7 );
			$node =& $node[( $rawOrd[$curBit >> 3] & ( 1 << $maskShift ) ) >> $maskShift];
			++$curBit;
			if ( $node === true ) {
				// already added a larger supernet, no need to go deeper
				return;
			} elseif ( $curBit == $mask ) {
				// this may wipe out deeper subnets from earlier
				$node = true;
				return;
			} elseif ( $node === false ) {
				// create new subarray to go deeper
				$node = array( false, false );
			}
		}
	}

	/**
	 * Match an IP address against the set
	 *
	 * @param string $ip string IPv[46] address
	 * @return bool true is match success, false is match failure
	 *
	 * If $ip is unparseable, inet_pton may issue an E_WARNING to that effect
	 */
	public function match( $ip ) {
		$raw = inet_pton( $ip );
		if ( $raw === false ) {
			return false; // inet_pton() sends an E_WARNING for us
		}

		$rawOrd = array_map( 'ord', str_split( $raw ) );
		if ( count( $rawOrd ) == 4 ) {
			$node =& $this->root4;
		} else {
			$node =& $this->root6;
		}

		$curBit = 0;
		while ( 1 ) {
			if ( isset( $node['comp'] ) ) {
				// compressed node, matches 1 whole byte on a byte boundary
				if ( $rawOrd[$curBit >> 3] != $node['comp'] ) {
					return false;
				}
				$curBit += 8;
				$node =& $node['next'];
			} else {
				// uncompressed node, walk in the correct direction for the current bit-value
				$maskShift = 7 - ( $curBit & 7 );
				$node =& $node[( $rawOrd[$curBit >> 3] & ( 1 << $maskShift ) ) >> $maskShift];
				++$curBit;
			}

			if ( $node === true || $node === false ) {
				return $node;
			}
		}
	}

	/**
	 * Recursively merges adjacent nets into larger supernets
	 *
	 * @param array &$node Tree node to optimize, by-reference
	 *
	 *  e.g.: 8.0.0.0/8 + 9.0.0.0/8 -> 8.0.0.0/7
	 */
	private static function recOptimize( &$node ) {
		if ( $node[0] !== false && $node[0] !== true && self::recOptimize( $node[0] ) ) {
			$node[0] = true;
		}
		if ( $node[1] !== false && $node[1] !== true && self::recOptimize( $node[1] ) ) {
			$node[1] = true;
		}
		if ( $node[0] === true && $node[1] === true ) {
			return true;
		}
		return false;
	}

	/**
	 * Recursively compresses a tree
	 *
	 * @param array &$node Tree node to compress, by-reference
	 * @param integer $curBit current depth in the tree
	 * @param integer $maxCompStart maximum depth at which compression can start, family-specific
	 *
	 * This is a very simplistic compression scheme: if we go through a whole
	 * byte of address starting at a byte boundary with no real branching
	 * other than immediate false-vs-(node|true), compress that subtree down to a single
	 * byte-matching node.
	 * The $maxCompStart check elides recursing the final 7 levels of depth (family-dependent)
	 */
	private static function recCompress( &$node, $curBit, $maxCompStart ) {
		if ( !( $curBit & 7 ) ) { // byte boundary, check for depth-8 single path(s)
			$byte = 0;
			$cnode =& $node;
			$i = 8;
			while ( $i-- ) {
				if ( $cnode[0] === false ) {
					$byte |= 1 << $i;
					$cnode =& $cnode[1];
				} elseif ( $cnode[1] === false ) {
					$cnode =& $cnode[0];
				} else {
					// partial-byte branching, give up
					break;
				}
			}
			if ( $i == -1 ) { // means we did not exit the while() via break
				$node = array(
					'comp' => $byte,
					'next' => &$cnode,
				);
				$curBit += 8;
				if ( $cnode !== true ) {
					self::recCompress( $cnode, $curBit, $maxCompStart );
				}
				return;
			}
		}

		++$curBit;
		if ( $curBit <= $maxCompStart ) {
			if ( $node[0] !== false && $node[0] !== true ) {
				self::recCompress( $node[0], $curBit, $maxCompStart );
			}
			if ( $node[1] !== false && $node[1] !== true ) {
				self::recCompress( $node[1], $curBit, $maxCompStart );
			}
		}
	}
}