[ Index ] |
PHP Cross Reference of YOURLS |
[Summary view] [Print] [Text view]
<?php

namespace WpOrg\Requests;

use WpOrg\Requests\Exception;
use WpOrg\Requests\Exception\InvalidArgument;
use WpOrg\Requests\Utility\InputValidator;

/**
 * IDNA URL encoder
 *
 * Converts hostnames containing non-ASCII characters into their ASCII
 * ("xn--" prefixed, Punycode-encoded) form.
 *
 * Note: Not fully compliant, as nameprep does nothing yet.
 *
 * @package Requests\Utilities
 *
 * @link https://tools.ietf.org/html/rfc3490 IDNA specification
 * @link https://tools.ietf.org/html/rfc3492 Punycode/Bootstrap specification
 */
class IdnaEncoder {
	/**
	 * ACE prefix used for IDNA
	 *
	 * @link https://tools.ietf.org/html/rfc3490#section-5
	 * @var string
	 */
	const ACE_PREFIX = 'xn--';

	/**
	 * Exclusive upper bound on the byte length of a single encoded label.
	 *
	 * `to_ascii()` only accepts results whose `strlen()` is strictly smaller
	 * than this value, i.e. labels of at most 63 bytes.
	 *
	 * @see \WpOrg\Requests\IdnaEncoder::to_ascii()
	 *
	 * @since 2.0.0
	 *
	 * @var int
	 */
	const MAX_LENGTH = 64;

	/**#@+
	 * Bootstrap constant for Punycode
	 *
	 * @link https://tools.ietf.org/html/rfc3492#section-5
	 * @var int
	 */
	const BOOTSTRAP_BASE         = 36;
	const BOOTSTRAP_TMIN         = 1;
	const BOOTSTRAP_TMAX         = 26;
	const BOOTSTRAP_SKEW         = 38;
	const BOOTSTRAP_DAMP         = 700;
	const BOOTSTRAP_INITIAL_BIAS = 72;
	const BOOTSTRAP_INITIAL_N    = 128;
	/**#@-*/

	/**
	 * Encode a hostname using Punycode
	 *
	 * The hostname is split on `.` and every label is passed through
	 * {@see \WpOrg\Requests\IdnaEncoder::to_ascii()} independently.
	 *
	 * @param string|Stringable $hostname Hostname
	 * @return string Punycode-encoded hostname
	 * @throws \WpOrg\Requests\Exception\InvalidArgument When the passed argument is not a string or a stringable object.
	 * @throws \WpOrg\Requests\Exception                 When one of the labels cannot be converted (see `to_ascii()`).
	 */
	public static function encode($hostname) {
		if (InputValidator::is_string_or_stringable($hostname) === false) {
			throw InvalidArgument::create(1, '$hostname', 'string|Stringable', gettype($hostname));
		}

		// Cast explicitly so stringable objects do not depend on implicit coercion inside explode().
		$parts = explode('.', (string) $hostname);

		// Assign by index rather than by reference: a by-reference foreach leaves a dangling reference behind.
		foreach ($parts as $index => $part) {
			$parts[$index] = self::to_ascii($part);
		}

		return implode('.', $parts);
	}

	/**
	 * Convert a UTF-8 text string to an ASCII string using Punycode
	 *
	 * The step numbers in the body follow the ToASCII operation of RFC 3490.
	 *
	 * @param string $text ASCII or UTF-8 string (the result must be shorter than `MAX_LENGTH` bytes, i.e. at most 63)
	 * @return string ASCII string
	 *
	 * @throws \WpOrg\Requests\Exception Provided ASCII string is 64 bytes or longer (`idna.provided_too_long`)
	 * @throws \WpOrg\Requests\Exception Prepared string is 64 bytes or longer (`idna.prepared_too_long`)
	 * @throws \WpOrg\Requests\Exception Provided string already begins with xn-- (`idna.provided_is_prefixed`)
	 * @throws \WpOrg\Requests\Exception Encoded string is 64 bytes or longer (`idna.encoded_too_long`)
	 */
	public static function to_ascii($text) {
		// Step 1: Check if the text is already ASCII
		if (self::is_ascii($text)) {
			// Skip to step 7
			if (strlen($text) < self::MAX_LENGTH) {
				return $text;
			}

			throw new Exception('Provided string is too long', 'idna.provided_too_long', $text);
		}

		// Step 2: nameprep
		$text = self::nameprep($text);

		// Step 3: UseSTD3ASCIIRules is false, continue
		// Step 4: Check if it's ASCII now
		if (self::is_ascii($text)) {
			// Skip to step 7
			/*
			 * As the `nameprep()` method returns the original string, this code will never be reached until
			 * that method is properly implemented.
			 */
			// @codeCoverageIgnoreStart
			if (strlen($text) < self::MAX_LENGTH) {
				return $text;
			}

			throw new Exception('Prepared string is too long', 'idna.prepared_too_long', $text);
			// @codeCoverageIgnoreEnd
		}

		// Step 5: Check ACE prefix
		if (strpos($text, self::ACE_PREFIX) === 0) {
			throw new Exception('Provided string begins with ACE prefix', 'idna.provided_is_prefixed', $text);
		}

		// Step 6: Encode with Punycode
		$text = self::punycode_encode($text);

		// Step 7: Prepend ACE prefix
		$text = self::ACE_PREFIX . $text;

		// Step 8: Check size
		if (strlen($text) < self::MAX_LENGTH) {
			return $text;
		}

		throw new Exception('Encoded string is too long', 'idna.encoded_too_long', $text);
	}

	/**
	 * Check whether a given text string contains only ASCII characters
	 *
	 * @internal (Testing found regex was the fastest implementation)
	 *
	 * @param string $text
	 * @return bool Is the text string ASCII-only?
	 */
	protected static function is_ascii($text) {
		// The pattern matches any single byte outside 0x00-0x7F; "no match" therefore means ASCII-only.
		return (preg_match('/(?:[^\x00-\x7F])/', $text) !== 1);
	}

	/**
	 * Prepare a text string for use as an IDNA name
	 *
	 * Currently a no-op: the text is returned unchanged.
	 *
	 * @todo Implement this based on RFC 3491 and the newer 5891
	 * @param string $text
	 * @return string Prepared string
	 */
	protected static function nameprep($text) {
		return $text;
	}

	/**
	 * Convert a UTF-8 string to a UCS-4 codepoint array
	 *
	 * Based on \WpOrg\Requests\Iri::replace_invalid_with_pct_encoding()
	 *
	 * @param string $input
	 * @return array Unicode code points
	 *
	 * @throws \WpOrg\Requests\Exception Invalid UTF-8 codepoint (`idna.invalidcodepoint`)
	 */
	protected static function utf8_to_codepoints($input) {
		$codepoints = [];

		// Get number of bytes
		$strlen = strlen($input);

		// phpcs:ignore Generic.CodeAnalysis.JumbledIncrementer -- This is a deliberate choice.
		for ($position = 0; $position < $strlen; $position++) {
			$value = ord($input[$position]);

			// Classify the lead byte: it fixes the sequence length and supplies the high bits.
			if ((~$value & 0x80) === 0x80) { // One byte sequence:
				$character = $value;
				$length    = 1;
				$remaining = 0;
			} elseif (($value & 0xE0) === 0xC0) { // Two byte sequence:
				$character = ($value & 0x1F) << 6;
				$length    = 2;
				$remaining = 1;
			} elseif (($value & 0xF0) === 0xE0) { // Three byte sequence:
				$character = ($value & 0x0F) << 12;
				$length    = 3;
				$remaining = 2;
			} elseif (($value & 0xF8) === 0xF0) { // Four byte sequence:
				$character = ($value & 0x07) << 18;
				$length    = 4;
				$remaining = 3;
			} else { // Invalid byte:
				throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $value);
			}

			if ($remaining > 0) {
				// The sequence announced by the lead byte must fit into the remaining input.
				if ($position + $length > $strlen) {
					throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
				}

				for ($position++; $remaining > 0; $position++) {
					$value = ord($input[$position]);

					// A continuation byte must have the form 10xxxxxx; anything else invalidates the sequence.
					if (($value & 0xC0) !== 0x80) {
						throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
					}

					--$remaining;
					$character |= ($value & 0x3F) << ($remaining * 6);
				}

				// The inner loop overshoots by one; step back so the outer increment lands on the next lead byte.
				$position--;
			}

			if (
				// Non-shortest form sequences are invalid
				($length > 1 && $character <= 0x7F)
				|| ($length > 2 && $character <= 0x7FF)
				|| ($length > 3 && $character <= 0xFFFF)
				// Outside of range of ucschar codepoints
				// Noncharacters
				|| ($character & 0xFFFE) === 0xFFFE
				|| ($character >= 0xFDD0 && $character <= 0xFDEF)
				|| (
					// Everything else not in ucschar
					($character > 0xD7FF && $character < 0xF900)
					|| $character < 0x20
					|| ($character > 0x7E && $character < 0xA0)
					|| $character > 0xEFFFD
				)
			) {
				throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
			}

			$codepoints[] = $character;
		}

		return $codepoints;
	}

	/**
	 * RFC3492-compliant encoder
	 *
	 * All intermediate values are kept as integers, so the digits handed to
	 * `digit_to_char()` and the bias returned by `adapt()` match their documented types.
	 *
	 * @internal Pseudo-code from Section 6.3 is commented with "#" next to relevant code
	 *
	 * @param string $input UTF-8 encoded string to encode
	 * @return string Punycode-encoded string
	 *
	 * @throws \WpOrg\Requests\Exception On character outside of the domain (never happens with Punycode) (`idna.character_outside_domain`)
	 * @throws \WpOrg\Requests\Exception On invalid UTF-8 in the input (`idna.invalidcodepoint`)
	 */
	public static function punycode_encode($input) {
		$output = '';
		// let n = initial_n
		$n = self::BOOTSTRAP_INITIAL_N;
		// let delta = 0
		$delta = 0;
		// let bias = initial_bias
		$bias = self::BOOTSTRAP_INITIAL_BIAS;
		// let h = b = the number of basic code points in the input
		$h = 0;
		$b = 0; // see loop
		// copy them to the output in order
		$codepoints = self::utf8_to_codepoints($input);
		$extended   = [];

		foreach ($codepoints as $char) {
			if ($char < 128) {
				// Character is valid ASCII
				// TODO: this should also check if it's valid for a URL
				$output .= chr($char);
				$h++;

				// Check if the character is non-ASCII, but below initial n
				// This never occurs for Punycode, so ignore in coverage
				// @codeCoverageIgnoreStart
			} elseif ($char < $n) {
				throw new Exception('Invalid character', 'idna.character_outside_domain', $char);
				// @codeCoverageIgnoreEnd
			} else {
				// Used as a set: each distinct non-basic code point is recorded once.
				$extended[$char] = true;
			}
		}

		// Distinct non-basic code points in ascending order; consumed front to back below.
		$extended = array_keys($extended);
		sort($extended);
		$b = $h;
		// [copy them] followed by a delimiter if b > 0
		if (strlen($output) > 0) {
			$output .= '-';
		}

		// {if the input contains a non-basic code point < n then fail}
		// while h < length(input) do begin
		$codepointcount = count($codepoints);
		while ($h < $codepointcount) {
			// let m = the minimum code point >= n in the input
			$m = array_shift($extended);
			// let delta = delta + (m - n) * (h + 1), fail on overflow
			$delta += ($m - $n) * ($h + 1);
			// let n = m
			$n = $m;
			// for each code point c in the input (in order) do begin
			for ($num = 0; $num < $codepointcount; $num++) {
				$c = $codepoints[$num];
				// if c < n then increment delta, fail on overflow
				if ($c < $n) {
					$delta++;
				} elseif ($c === $n) { // if c == n then begin
					// let q = delta
					$q = $delta;
					// for k = base to infinity in steps of base do begin
					for ($k = self::BOOTSTRAP_BASE; ; $k += self::BOOTSTRAP_BASE) {
						// let t = tmin if k <= bias {+ tmin}, or
						//     tmax if k >= bias + tmax, or k - bias otherwise
						if ($k <= ($bias + self::BOOTSTRAP_TMIN)) {
							$t = self::BOOTSTRAP_TMIN;
						} elseif ($k >= ($bias + self::BOOTSTRAP_TMAX)) {
							$t = self::BOOTSTRAP_TMAX;
						} else {
							$t = $k - $bias;
						}

						// if q < t then break
						if ($q < $t) {
							break;
						}

						// output the code point for digit t + ((q - t) mod (base - t))
						$digit   = $t + (($q - $t) % (self::BOOTSTRAP_BASE - $t));
						$output .= self::digit_to_char($digit);
						// let q = (q - t) div (base - t)
						// floor() returns a float; cast back so q (and every later digit) stays an integer.
						$q = (int) floor(($q - $t) / (self::BOOTSTRAP_BASE - $t));
					} // end
					// output the code point for digit q
					$output .= self::digit_to_char($q);
					// let bias = adapt(delta, h + 1, test h equals b?)
					$bias = self::adapt($delta, $h + 1, $h === $b);
					// let delta = 0
					$delta = 0;
					// increment h
					$h++;
				} // end
			} // end
			// increment delta and n
			$delta++;
			$n++;
		} // end

		return $output;
	}

	/**
	 * Convert a digit to its respective character
	 *
	 * Digits 0-25 map to `a`-`z`, digits 26-35 map to `0`-`9`.
	 *
	 * @link https://tools.ietf.org/html/rfc3492#section-5
	 *
	 * @param int $digit Digit in the range 0-35
	 * @return string Single character corresponding to digit
	 *
	 * @throws \WpOrg\Requests\Exception On invalid digit (`idna.invalid_digit`)
	 */
	protected static function digit_to_char($digit) {
		// @codeCoverageIgnoreStart
		// As far as I know, this never happens, but still good to be sure.
		if ($digit < 0 || $digit > 35) {
			throw new Exception(sprintf('Invalid digit %d', $digit), 'idna.invalid_digit', $digit);
		}

		// @codeCoverageIgnoreEnd
		$digits = 'abcdefghijklmnopqrstuvwxyz0123456789';
		return substr($digits, $digit, 1);
	}

	/**
	 * Adapt the bias
	 *
	 * Every "div" of the RFC pseudo-code is carried out as an integer division,
	 * so the returned bias is a real integer rather than a whole-number float.
	 *
	 * @link https://tools.ietf.org/html/rfc3492#section-6.1
	 * @param int $delta
	 * @param int $numpoints
	 * @param bool $firsttime
	 * @return int New bias
	 *
	 * function adapt(delta,numpoints,firsttime):
	 */
	protected static function adapt($delta, $numpoints, $firsttime) {
		// if firsttime then let delta = delta div damp
		if ($firsttime) {
			$delta = (int) floor($delta / self::BOOTSTRAP_DAMP);
		} else {
			// else let delta = delta div 2
			$delta = (int) floor($delta / 2);
		}

		// let delta = delta + (delta div numpoints)
		$delta += (int) floor($delta / $numpoints);
		// let k = 0
		$k = 0;
		// while delta > ((base - tmin) * tmax) div 2 do begin
		$max = (int) floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN) * self::BOOTSTRAP_TMAX) / 2);
		while ($delta > $max) {
			// let delta = delta div (base - tmin)
			$delta = (int) floor($delta / (self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN));
			// let k = k + base
			$k += self::BOOTSTRAP_BASE;
		} // end
		// return k + (((base - tmin + 1) * delta) div (delta + skew))
		return $k + (int) floor(((self::BOOTSTRAP_BASE - self::BOOTSTRAP_TMIN + 1) * $delta) / ($delta + self::BOOTSTRAP_SKEW));
	}
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Tue Jan 21 05:10:11 2025 | Cross-referenced by PHPXref 0.7.1 |