[ Index ] |
PHP Cross Reference of YOURLS |
[Summary view] [Print] [Text view]
<?php

/*
 * This file is part of the Symfony package.
 *
 * (c) Fabien Potencier <[email protected]> and Trevor Rowbotham <[email protected]>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace Symfony\Polyfill\Intl\Idn;

use Symfony\Polyfill\Intl\Idn\Resources\unidata\DisallowedRanges;
use Symfony\Polyfill\Intl\Idn\Resources\unidata\Regex;

/**
 * Userland implementation of idn_to_ascii() / idn_to_utf8() following UTS #46, including the
 * Punycode (RFC 3492) encoder and decoder it relies on.
 *
 * @see https://www.unicode.org/reports/tr46/
 *
 * @internal
 */
final class Idn
{
    // Bit flags OR-ed into Info::$errors and reported through $idna_info['errors'].
    public const ERROR_EMPTY_LABEL = 1;
    public const ERROR_LABEL_TOO_LONG = 2;
    public const ERROR_DOMAIN_NAME_TOO_LONG = 4;
    public const ERROR_LEADING_HYPHEN = 8;
    public const ERROR_TRAILING_HYPHEN = 0x10;
    public const ERROR_HYPHEN_3_4 = 0x20;
    public const ERROR_LEADING_COMBINING_MARK = 0x40;
    public const ERROR_DISALLOWED = 0x80;
    public const ERROR_PUNYCODE = 0x100;
    public const ERROR_LABEL_HAS_DOT = 0x200;
    public const ERROR_INVALID_ACE_LABEL = 0x400;
    public const ERROR_BIDI = 0x800;
    public const ERROR_CONTEXTJ = 0x1000;
    public const ERROR_CONTEXTO_PUNCTUATION = 0x2000;
    public const ERROR_CONTEXTO_DIGITS = 0x4000;

    public const INTL_IDNA_VARIANT_2003 = 0;
    public const INTL_IDNA_VARIANT_UTS46 = 1;

    // Bit flags accepted in the $options argument of idn_to_ascii() / idn_to_utf8().
    public const IDNA_DEFAULT = 0;
    public const IDNA_ALLOW_UNASSIGNED = 1;
    public const IDNA_USE_STD3_RULES = 2;
    public const IDNA_CHECK_BIDI = 4;
    public const IDNA_CHECK_CONTEXTJ = 8;
    public const IDNA_NONTRANSITIONAL_TO_ASCII = 16;
    public const IDNA_NONTRANSITIONAL_TO_UNICODE = 32;

    public const MAX_DOMAIN_SIZE = 253;
    public const MAX_LABEL_SIZE = 63;

    // Punycode parameters (RFC 3492, section 5).
    public const BASE = 36;
    public const TMIN = 1;
    public const TMAX = 26;
    public const SKEW = 38;
    public const DAMP = 700;
    public const INITIAL_BIAS = 72;
    public const INITIAL_N = 128;
    public const DELIMITER = '-';
    public const MAX_INT = 2147483647;

    /**
     * Contains the numeric value of a basic code point (for use in representing integers) in the
     * range 0 to BASE-1, or -1 if the byte does not represent a value. Indexed by byte value
     * (0-255): letters of either case map to 0-25 and the digits '0'-'9' map to 26-35.
     *
     * @var array<int, int>
     */
    private static $basicToDigit = [
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,

        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    ];

    /**
     * Lazily loaded from Resources/unidata/virama.php; keyed by code point.
     *
     * @var array<int, int>
     */
    private static $virama;

    /**
     * @var array<int, string>
     */
    private static $mapped;

    /**
     * @var array<int, bool>
     */
    private static $ignored;

    /**
     * @var array<int, string>
     */
    private static $deviation;

    /**
     * @var array<int, bool>
     */
    private static $disallowed;

    /**
     * @var array<int, string>
     */
    private static $disallowed_STD3_mapped;

    /**
     * @var array<int, bool>
     */
    private static $disallowed_STD3_valid;

    /**
     * Whether the IDNA mapping tables above have been loaded by lookupCodePointStatus().
     *
     * @var bool
     */
    private static $mappingTableLoaded = false;

    /**
     * Converts a domain name to its ASCII (Punycode) form.
     *
     * The result, the "isTransitionalDifferent" flag and the error bitmask are always written to
     * $idna_info, even when the conversion fails.
     *
     * @see https://www.unicode.org/reports/tr46/#ToASCII
     *
     * @param string $domainName
     * @param int    $options    Bitmask of the IDNA_* constants
     * @param int    $variant    One of the INTL_IDNA_VARIANT_* constants
     * @param array  $idna_info  Receives the keys "result", "isTransitionalDifferent" and "errors"
     *
     * @return string|false The converted domain, or false if any error was recorded
     */
    public static function idn_to_ascii($domainName, $options = self::IDNA_DEFAULT, $variant = self::INTL_IDNA_VARIANT_UTS46, &$idna_info = [])
    {
        if (self::INTL_IDNA_VARIANT_2003 === $variant) {
            @trigger_error('idn_to_ascii(): INTL_IDNA_VARIANT_2003 is deprecated', \E_USER_DEPRECATED);
        }

        // From here on $options holds the UTS #46 processing flags derived from the bitmask.
        $options = [
            'CheckHyphens' => true,
            'CheckBidi' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 !== ($options & self::IDNA_CHECK_BIDI),
            'CheckJoiners' => self::INTL_IDNA_VARIANT_UTS46 === $variant && 0 !== ($options & self::IDNA_CHECK_CONTEXTJ),
            'UseSTD3ASCIIRules' => 0 !== ($options & self::IDNA_USE_STD3_RULES),
            'Transitional_Processing' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 === ($options & self::IDNA_NONTRANSITIONAL_TO_ASCII),
            'VerifyDnsLength' => true,
        ];
        $info = new Info();
        $labels = self::process((string) $domainName, $options, $info);

        foreach ($labels as $i => $label) {
            // Only convert labels to punycode that contain non-ASCII code points
            if (1 === preg_match('/[^\x00-\x7F]/', $label)) {
                try {
                    $label = 'xn--'.self::punycodeEncode($label);
                } catch (\Exception $e) {
                    $info->errors |= self::ERROR_PUNYCODE;
                }

                $labels[$i] = $label;
            }
        }

        if ($options['VerifyDnsLength']) {
            self::validateDomainAndLabelLength($labels, $info);
        }

        $idna_info = [
            'result' => implode('.', $labels),
            'isTransitionalDifferent' => $info->transitionalDifferent,
            'errors' => $info->errors,
        ];

        return 0 === $info->errors ? $idna_info['result'] : false;
    }

    /**
     * Converts a domain name to its Unicode (UTF-8) form.
     *
     * Unlike idn_to_ascii(), no "VerifyDnsLength" option is passed to process(), so no DNS length
     * validation is performed here.
     *
     * @see https://www.unicode.org/reports/tr46/#ToUnicode
     *
     * @param string $domainName
     * @param int    $options    Bitmask of the IDNA_* constants
     * @param int    $variant    One of the INTL_IDNA_VARIANT_* constants
     * @param array  $idna_info  Receives the keys "result", "isTransitionalDifferent" and "errors"
     *
     * @return string|false The converted domain, or false if any error was recorded
     */
    public static function idn_to_utf8($domainName, $options = self::IDNA_DEFAULT, $variant = self::INTL_IDNA_VARIANT_UTS46, &$idna_info = [])
    {
        if (self::INTL_IDNA_VARIANT_2003 === $variant) {
            @trigger_error('idn_to_utf8(): INTL_IDNA_VARIANT_2003 is deprecated', \E_USER_DEPRECATED);
        }

        $info = new Info();
        $labels = self::process((string) $domainName, [
            'CheckHyphens' => true,
            'CheckBidi' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 !== ($options & self::IDNA_CHECK_BIDI),
            'CheckJoiners' => self::INTL_IDNA_VARIANT_UTS46 === $variant && 0 !== ($options & self::IDNA_CHECK_CONTEXTJ),
            'UseSTD3ASCIIRules' => 0 !== ($options & self::IDNA_USE_STD3_RULES),
            'Transitional_Processing' => self::INTL_IDNA_VARIANT_2003 === $variant || 0 === ($options & self::IDNA_NONTRANSITIONAL_TO_UNICODE),
        ], $info);
        $idna_info = [
            'result' => implode('.', $labels),
            'isTransitionalDifferent' => $info->transitionalDifferent,
            'errors' => $info->errors,
        ];

        return 0 === $info->errors ? $idna_info['result'] : false;
    }

    /**
     * Checks every U+200C (ZWNJ) and U+200D (ZWJ) in a label against the ContextJ rules.
     *
     * @param array<int, int> $codePoints Code points of $label, as returned by utf8Decode()
     * @param string          $label
     *
     * @return bool
     */
    private static function isValidContextJ(array $codePoints, $label)
    {
        if (!isset(self::$virama)) {
            self::$virama = require __DIR__.\DIRECTORY_SEPARATOR.'Resources'.\DIRECTORY_SEPARATOR.'unidata'.\DIRECTORY_SEPARATOR.'virama.php';
        }

        // Byte offset in $label from which the next ZWNJ context search starts.
        $offset = 0;

        foreach ($codePoints as $i => $codePoint) {
            if (0x200C !== $codePoint && 0x200D !== $codePoint) {
                continue;
            }

            // A joiner can never be the first code point of a label.
            if (!isset($codePoints[$i - 1])) {
                return false;
            }

            // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
            if (isset(self::$virama[$codePoints[$i - 1]])) {
                continue;
            }

            // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C(Joining_Type:T)*(Joining_Type:{R,D})) Then
            // True;
            // Generated RegExp = ([Joining_Type:{L,D}][Joining_Type:T]*\u200C[Joining_Type:T]*)[Joining_Type:{R,D}]
            if (0x200C === $codePoint && 1 === preg_match(Regex::ZWNJ, $label, $matches, \PREG_OFFSET_CAPTURE, $offset)) {
                // With PREG_OFFSET_CAPTURE, $matches[1] is [text, byte offset]. Resume right after
                // the captured group: adding only its length to the previous offset would ignore
                // where the match started, so the next search could re-match this same sequence
                // or start in the middle of a multi-byte character.
                $offset = $matches[1][1] + \strlen($matches[1][0]);

                continue;
            }

            return false;
        }

        return true;
    }

    /**
     * Applies the IDNA mapping table to every code point of the input.
     *
     * Disallowed code points are copied through unchanged; they are reported later by
     * validateLabel().
     *
     * @see https://www.unicode.org/reports/tr46/#ProcessingStepMap
     *
     * @param string              $input
     * @param array<string, bool> $options
     *
     * @return string
     */
    private static function mapCodePoints($input, array $options, Info $info)
    {
        $str = '';
        $useSTD3ASCIIRules = $options['UseSTD3ASCIIRules'];
        $transitional = $options['Transitional_Processing'];

        foreach (self::utf8Decode($input) as $codePoint) {
            $data = self::lookupCodePointStatus($codePoint, $useSTD3ASCIIRules);

            switch ($data['status']) {
                case 'disallowed':
                case 'valid':
                    $str .= mb_chr($codePoint, 'utf-8');

                    break;

                case 'ignored':
                    // Do nothing.
                    break;

                case 'mapped':
                    // U+1E9E (capital sharp s) becomes "ss" under transitional processing.
                    $str .= $transitional && 0x1E9E === $codePoint ? 'ss' : $data['mapping'];

                    break;

                case 'deviation':
                    $info->transitionalDifferent = true;
                    $str .= ($transitional ? $data['mapping'] : mb_chr($codePoint, 'utf-8'));

                    break;
            }
        }

        return $str;
    }

    /**
     * Runs the UTS #46 processing steps (map, normalize, break, convert/validate) on a domain.
     *
     * Errors are accumulated in $info; the labels are returned whether or not errors occurred.
     *
     * @see https://www.unicode.org/reports/tr46/#Processing
     *
     * @param string              $domain
     * @param array<string, bool> $options
     *
     * @return array<int, string>
     */
    private static function process($domain, array $options, Info $info)
    {
        // If VerifyDnsLength is not set, we are doing ToUnicode otherwise we are doing ToASCII and
        // we need to respect the VerifyDnsLength option.
        $checkForEmptyLabels = !isset($options['VerifyDnsLength']) || $options['VerifyDnsLength'];

        if ($checkForEmptyLabels && '' === $domain) {
            $info->errors |= self::ERROR_EMPTY_LABEL;

            return [$domain];
        }

        // Step 1. Map each code point in the domain name string
        $domain = self::mapCodePoints($domain, $options, $info);

        // Step 2. Normalize the domain name string to Unicode Normalization Form C.
        if (!\Normalizer::isNormalized($domain, \Normalizer::FORM_C)) {
            $domain = \Normalizer::normalize($domain, \Normalizer::FORM_C);
        }

        // Step 3. Break the string into labels at U+002E (.) FULL STOP.
        $labels = explode('.', $domain);
        $lastLabelIndex = \count($labels) - 1;

        // Step 4. Convert and validate each label in the domain name string.
        foreach ($labels as $i => $label) {
            $validationOptions = $options;

            if ('xn--' === substr($label, 0, 4)) {
                // Step 4.1. If the label contains any non-ASCII code point (i.e., a code point greater than U+007F),
                // record that there was an error, and continue with the next label.
                if (preg_match('/[^\x00-\x7F]/', $label)) {
                    $info->errors |= self::ERROR_PUNYCODE;

                    continue;
                }

                // Step 4.2. Attempt to convert the rest of the label to Unicode according to Punycode [RFC3492]. If
                // that conversion fails, record that there was an error, and continue
                // with the next label. Otherwise replace the original label in the string by the results of the
                // conversion.
                try {
                    $label = self::punycodeDecode(substr($label, 4));
                } catch (\Exception $e) {
                    $info->errors |= self::ERROR_PUNYCODE;

                    continue;
                }

                // Decoded labels are always validated with non-transitional processing.
                $validationOptions['Transitional_Processing'] = false;
                $labels[$i] = $label;
            }

            // Only a trailing label that is not also the first one may be empty (root label).
            self::validateLabel($label, $info, $validationOptions, $i > 0 && $i === $lastLabelIndex);
        }

        if ($info->bidiDomain && !$info->validBidiDomain) {
            $info->errors |= self::ERROR_BIDI;
        }

        // Any input domain name string that does not record an error has been successfully
        // processed according to this specification. Conversely, if an input domain_name string
        // causes an error, then the processing of the input domain_name string fails. Determining
        // what to do with error input is up to the caller, and not in the scope of this document.
        return $labels;
    }

    /**
     * Checks a label against the Bidi rule and records the outcome on $info.
     *
     * Sets $info->bidiDomain when the label matches Regex::RTL_LABEL and clears
     * $info->validBidiDomain when any of the numbered conditions fails.
     *
     * @see https://tools.ietf.org/html/rfc5893#section-2
     *
     * @param string $label
     */
    private static function validateBidiLabel($label, Info $info)
    {
        if (1 === preg_match(Regex::RTL_LABEL, $label)) {
            $info->bidiDomain = true;

            // Step 1. The first character must be a character with Bidi property L, R, or AL.
            // If it has the R or AL property, it is an RTL label
            if (1 !== preg_match(Regex::BIDI_STEP_1_RTL, $label)) {
                $info->validBidiDomain = false;

                return;
            }

            // Step 2. In an RTL label, only characters with the Bidi properties R, AL, AN, EN, ES,
            // CS, ET, ON, BN, or NSM are allowed.
            if (1 === preg_match(Regex::BIDI_STEP_2, $label)) {
                $info->validBidiDomain = false;

                return;
            }

            // Step 3. In an RTL label, the end of the label must be a character with Bidi property
            // R, AL, EN, or AN, followed by zero or more characters with Bidi property NSM.
            if (1 !== preg_match(Regex::BIDI_STEP_3, $label)) {
                $info->validBidiDomain = false;

                return;
            }

            // Step 4. In an RTL label, if an EN is present, no AN may be present, and vice versa.
            if (1 === preg_match(Regex::BIDI_STEP_4_AN, $label) && 1 === preg_match(Regex::BIDI_STEP_4_EN, $label)) {
                $info->validBidiDomain = false;

                return;
            }

            return;
        }

        // We are a LTR label
        // Step 1. The first character must be a character with Bidi property L, R, or AL.
        // If it has the L property, it is an LTR label.
        if (1 !== preg_match(Regex::BIDI_STEP_1_LTR, $label)) {
            $info->validBidiDomain = false;

            return;
        }

        // Step 5. In an LTR label, only characters with the Bidi properties L, EN,
        // ES, CS, ET, ON, BN, or NSM are allowed.
        if (1 === preg_match(Regex::BIDI_STEP_5, $label)) {
            $info->validBidiDomain = false;

            return;
        }

        // Step 6. In an LTR label, the end of the label must be a character with Bidi property L or
        // EN, followed by zero or more characters with Bidi property NSM.
        if (1 !== preg_match(Regex::BIDI_STEP_6, $label)) {
            $info->validBidiDomain = false;

            return;
        }
    }

    /**
     * Records ERROR_LABEL_TOO_LONG / ERROR_DOMAIN_NAME_TOO_LONG when the byte lengths of the
     * labels or of the whole domain exceed MAX_LABEL_SIZE / MAX_DOMAIN_SIZE.
     *
     * @param array<int, string> $labels
     */
    private static function validateDomainAndLabelLength(array $labels, Info $info)
    {
        $maxDomainSize = self::MAX_DOMAIN_SIZE;
        $length = \count($labels);

        // Number of "." delimiters.
        $domainLength = $length - 1;

        // If the last label is empty and it is not the first label, then it is the root label.
        // Increase the max size by 1, making it 254, to account for the root label's "."
        // delimiter. This also means we don't need to check the last label's length for being too
        // long.
        if ($length > 1 && '' === $labels[$length - 1]) {
            ++$maxDomainSize;
            --$length;
        }

        for ($i = 0; $i < $length; ++$i) {
            $bytes = \strlen($labels[$i]);
            $domainLength += $bytes;

            if ($bytes > self::MAX_LABEL_SIZE) {
                $info->errors |= self::ERROR_LABEL_TOO_LONG;
            }
        }

        if ($domainLength > $maxDomainSize) {
            $info->errors |= self::ERROR_DOMAIN_NAME_TOO_LONG;
        }
    }

    /**
     * Validates a single label and records every violated criterion on $info.
     *
     * @see https://www.unicode.org/reports/tr46/#Validity_Criteria
     *
     * @param string              $label
     * @param array<string, bool> $options
     * @param bool                $canBeEmpty
     */
    private static function validateLabel($label, Info $info, array $options, $canBeEmpty)
    {
        if ('' === $label) {
            if (!$canBeEmpty && (!isset($options['VerifyDnsLength']) || $options['VerifyDnsLength'])) {
                $info->errors |= self::ERROR_EMPTY_LABEL;
            }

            return;
        }

        // Step 1. The label must be in Unicode Normalization Form C.
        if (!\Normalizer::isNormalized($label, \Normalizer::FORM_C)) {
            $info->errors |= self::ERROR_INVALID_ACE_LABEL;
        }

        $codePoints = self::utf8Decode($label);

        if ($options['CheckHyphens']) {
            // Step 2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character
            // in both the third and fourth positions.
            if (isset($codePoints[2], $codePoints[3]) && 0x002D === $codePoints[2] && 0x002D === $codePoints[3]) {
                $info->errors |= self::ERROR_HYPHEN_3_4;
            }

            // Step 3. If CheckHyphens, the label must neither begin nor end with a U+002D
            // HYPHEN-MINUS character.
            if ('-' === substr($label, 0, 1)) {
                $info->errors |= self::ERROR_LEADING_HYPHEN;
            }

            if ('-' === substr($label, -1, 1)) {
                $info->errors |= self::ERROR_TRAILING_HYPHEN;
            }
        } elseif ('xn--' === substr($label, 0, 4)) {
            $info->errors |= self::ERROR_PUNYCODE;
        }

        // Step 4. The label must not contain a U+002E (.) FULL STOP.
        if (false !== strpos($label, '.')) {
            $info->errors |= self::ERROR_LABEL_HAS_DOT;
        }

        // Step 5. The label must not begin with a combining mark, that is: General_Category=Mark.
        if (1 === preg_match(Regex::COMBINING_MARK, $label)) {
            $info->errors |= self::ERROR_LEADING_COMBINING_MARK;
        }

        // Step 6. Each code point in the label must only have certain status values according to
        // Section 5, IDNA Mapping Table:
        $transitional = $options['Transitional_Processing'];
        $useSTD3ASCIIRules = $options['UseSTD3ASCIIRules'];

        foreach ($codePoints as $codePoint) {
            $data = self::lookupCodePointStatus($codePoint, $useSTD3ASCIIRules);
            $status = $data['status'];

            if ('valid' === $status || (!$transitional && 'deviation' === $status)) {
                continue;
            }

            $info->errors |= self::ERROR_DISALLOWED;

            break;
        }

        // Step 7. If CheckJoiners, the label must satisfy the ContextJ rules from Appendix A, in
        // The Unicode Code Points and Internationalized Domain Names for Applications (IDNA)
        // [IDNA2008].
        if ($options['CheckJoiners'] && !self::isValidContextJ($codePoints, $label)) {
            $info->errors |= self::ERROR_CONTEXTJ;
        }

        // Step 8. If CheckBidi, and if the domain name is a Bidi domain name, then the label must
        // satisfy all six of the numbered conditions in [IDNA2008] RFC 5893, Section 2.
        if ($options['CheckBidi'] && (!$info->bidiDomain || $info->validBidiDomain)) {
            self::validateBidiLabel($label, $info);
        }
    }

    /**
     * Decodes a Punycode string (without the "xn--" prefix) to UTF-8.
     *
     * @see https://tools.ietf.org/html/rfc3492#section-6.2
     *
     * @param string $input
     *
     * @return string
     *
     * @throws \Exception When the input is malformed, an intermediate value overflows MAX_INT, or
     *                    a decoded value is not a Unicode scalar value
     */
    private static function punycodeDecode($input)
    {
        $n = self::INITIAL_N;
        $out = 0;
        $i = 0;
        $bias = self::INITIAL_BIAS;
        $lastDelimIndex = strrpos($input, self::DELIMITER);
        $b = false === $lastDelimIndex ? 0 : $lastDelimIndex;
        $inputLength = \strlen($input);
        $output = [];
        $bytes = array_map('ord', str_split($input));

        // Everything before the last delimiter is copied verbatim as basic code points.
        for ($j = 0; $j < $b; ++$j) {
            if ($bytes[$j] > 0x7F) {
                throw new \Exception('Invalid input');
            }

            $output[$out++] = $input[$j];
        }

        // Skip the delimiter itself when basic code points were copied.
        if ($b > 0) {
            ++$b;
        }

        for ($in = $b; $in < $inputLength; ++$out) {
            $oldi = $i;
            $w = 1;

            // Decode one generalized variable-length integer into $i.
            for ($k = self::BASE; /* no condition */; $k += self::BASE) {
                if ($in >= $inputLength) {
                    throw new \Exception('Invalid input');
                }

                $digit = self::$basicToDigit[$bytes[$in++] & 0xFF];

                if ($digit < 0) {
                    throw new \Exception('Invalid input');
                }

                if ($digit > intdiv(self::MAX_INT - $i, $w)) {
                    throw new \Exception('Integer overflow');
                }

                $i += $digit * $w;

                if ($k <= $bias) {
                    $t = self::TMIN;
                } elseif ($k >= $bias + self::TMAX) {
                    $t = self::TMAX;
                } else {
                    $t = $k - $bias;
                }

                if ($digit < $t) {
                    break;
                }

                $baseMinusT = self::BASE - $t;

                if ($w > intdiv(self::MAX_INT, $baseMinusT)) {
                    throw new \Exception('Integer overflow');
                }

                $w *= $baseMinusT;
            }

            $outPlusOne = $out + 1;
            $bias = self::adaptBias($i - $oldi, $outPlusOne, 0 === $oldi);

            if (intdiv($i, $outPlusOne) > self::MAX_INT - $n) {
                throw new \Exception('Integer overflow');
            }

            $n += intdiv($i, $outPlusOne);

            // $n is only bounded by MAX_INT above, so it can be a surrogate or exceed U+10FFFF.
            // Such a value has no UTF-8 form; reject it instead of letting mb_chr() fail and the
            // character silently vanish from the decoded label.
            if ($n > 0x10FFFF || ($n >= 0xD800 && $n <= 0xDFFF)) {
                throw new \Exception('Invalid input');
            }

            $i %= $outPlusOne;
            // Insert the decoded character at position $i of the output.
            array_splice($output, $i++, 0, [mb_chr($n, 'utf-8')]);
        }

        return implode('', $output);
    }

    /**
     * Encodes a UTF-8 string as Punycode (without the "xn--" prefix).
     *
     * @see https://tools.ietf.org/html/rfc3492#section-6.3
     *
     * @param string $input
     *
     * @return string
     *
     * @throws \Exception When an intermediate value overflows
     */
    private static function punycodeEncode($input)
    {
        $n = self::INITIAL_N;
        $delta = 0;
        $out = 0;
        $bias = self::INITIAL_BIAS;
        $inputLength = 0;
        $output = '';
        $iter = self::utf8Decode($input);

        // Copy the basic (ASCII) code points to the output first.
        foreach ($iter as $codePoint) {
            ++$inputLength;

            if ($codePoint < 0x80) {
                $output .= \chr($codePoint);
                ++$out;
            }
        }

        // $h: code points handled so far, $b: number of basic code points.
        $h = $out;
        $b = $out;

        if ($b > 0) {
            $output .= self::DELIMITER;
            ++$out;
        }

        while ($h < $inputLength) {
            // Find the smallest code point >= $n that is still to be encoded.
            $m = self::MAX_INT;

            foreach ($iter as $codePoint) {
                if ($codePoint >= $n && $codePoint < $m) {
                    $m = $codePoint;
                }
            }

            if ($m - $n > intdiv(self::MAX_INT - $delta, $h + 1)) {
                throw new \Exception('Integer overflow');
            }

            $delta += ($m - $n) * ($h + 1);
            $n = $m;

            foreach ($iter as $codePoint) {
                if ($codePoint < $n && 0 === ++$delta) {
                    throw new \Exception('Integer overflow');
                }

                if ($codePoint === $n) {
                    // Emit $delta as a generalized variable-length integer.
                    $q = $delta;

                    for ($k = self::BASE; /* no condition */; $k += self::BASE) {
                        if ($k <= $bias) {
                            $t = self::TMIN;
                        } elseif ($k >= $bias + self::TMAX) {
                            $t = self::TMAX;
                        } else {
                            $t = $k - $bias;
                        }

                        if ($q < $t) {
                            break;
                        }

                        $qMinusT = $q - $t;
                        $baseMinusT = self::BASE - $t;
                        $output .= self::encodeDigit($t + $qMinusT % $baseMinusT, false);
                        ++$out;
                        $q = intdiv($qMinusT, $baseMinusT);
                    }

                    $output .= self::encodeDigit($q, false);
                    ++$out;
                    $bias = self::adaptBias($delta, $h + 1, $h === $b);
                    $delta = 0;
                    ++$h;
                }
            }

            ++$delta;
            ++$n;
        }

        return $output;
    }

    /**
     * Punycode bias adaptation function.
     *
     * @see https://tools.ietf.org/html/rfc3492#section-6.1
     *
     * @param int  $delta
     * @param int  $numPoints
     * @param bool $firstTime
     *
     * @return int
     */
    private static function adaptBias($delta, $numPoints, $firstTime)
    {
        // xxx >> 1 is a faster way of doing intdiv(xxx, 2)
        $delta = $firstTime ? intdiv($delta, self::DAMP) : $delta >> 1;
        $delta += intdiv($delta, $numPoints);
        $k = 0;

        while ($delta > ((self::BASE - self::TMIN) * self::TMAX) >> 1) {
            $delta = intdiv($delta, self::BASE - self::TMIN);
            $k += self::BASE;
        }

        return $k + intdiv((self::BASE - self::TMIN + 1) * $delta, $delta + self::SKEW);
    }

    /**
     * Returns the basic code point for a Punycode digit: 0-25 map to "a"-"z" and 26-35 map to
     * "0"-"9". When $flag is true, 32 is subtracted, which turns a letter into its uppercase form.
     *
     * @param int  $d
     * @param bool $flag
     *
     * @return string
     */
    private static function encodeDigit($d, $flag)
    {
        return \chr($d + 22 + 75 * ($d < 26 ? 1 : 0) - (($flag ? 1 : 0) << 5));
    }

    /**
     * Takes a UTF-8 encoded string and converts it into a series of integer code points. Any
     * invalid byte sequences will be replaced by a U+FFFD replacement code point.
     *
     * @see https://encoding.spec.whatwg.org/#utf-8-decoder
     *
     * @param string $input
     *
     * @return array<int, int>
     */
    private static function utf8Decode($input)
    {
        $bytesSeen = 0;
        $bytesNeeded = 0;
        $lowerBoundary = 0x80;
        $upperBoundary = 0xBF;
        $codePoint = 0;
        $codePoints = [];
        $length = \strlen($input);

        for ($i = 0; $i < $length; ++$i) {
            $byte = \ord($input[$i]);

            // Not inside a multi-byte sequence: this byte is ASCII, a lead byte, or invalid.
            if (0 === $bytesNeeded) {
                if ($byte >= 0x00 && $byte <= 0x7F) {
                    $codePoints[] = $byte;

                    continue;
                }

                if ($byte >= 0xC2 && $byte <= 0xDF) {
                    $bytesNeeded = 1;
                    $codePoint = $byte & 0x1F;
                } elseif ($byte >= 0xE0 && $byte <= 0xEF) {
                    // Narrow the range allowed for the next byte after 0xE0 and 0xED.
                    if (0xE0 === $byte) {
                        $lowerBoundary = 0xA0;
                    } elseif (0xED === $byte) {
                        $upperBoundary = 0x9F;
                    }

                    $bytesNeeded = 2;
                    $codePoint = $byte & 0xF;
                } elseif ($byte >= 0xF0 && $byte <= 0xF4) {
                    // Narrow the range allowed for the next byte after 0xF0 and 0xF4.
                    if (0xF0 === $byte) {
                        $lowerBoundary = 0x90;
                    } elseif (0xF4 === $byte) {
                        $upperBoundary = 0x8F;
                    }

                    $bytesNeeded = 3;
                    $codePoint = $byte & 0x7;
                } else {
                    $codePoints[] = 0xFFFD;
                }

                continue;
            }

            // Invalid continuation byte: emit U+FFFD, reset the state and reprocess this byte.
            if ($byte < $lowerBoundary || $byte > $upperBoundary) {
                $codePoint = 0;
                $bytesNeeded = 0;
                $bytesSeen = 0;
                $lowerBoundary = 0x80;
                $upperBoundary = 0xBF;
                --$i;
                $codePoints[] = 0xFFFD;

                continue;
            }

            $lowerBoundary = 0x80;
            $upperBoundary = 0xBF;
            $codePoint = ($codePoint << 6) | ($byte & 0x3F);

            if (++$bytesSeen !== $bytesNeeded) {
                continue;
            }

            $codePoints[] = $codePoint;
            $codePoint = 0;
            $bytesNeeded = 0;
            $bytesSeen = 0;
        }

        // String unexpectedly ended, so append a U+FFFD code point.
        if (0 !== $bytesNeeded) {
            $codePoints[] = 0xFFFD;
        }

        return $codePoints;
    }

    /**
     * Looks up the IDNA mapping table status of a code point, loading the tables on first use.
     *
     * The lookup order is: mapped, ignored, deviation, disallowed, then the two STD3 tables, whose
     * entries are only "disallowed" when $useSTD3ASCIIRules is true. Anything else is "valid".
     *
     * @param int  $codePoint
     * @param bool $useSTD3ASCIIRules
     *
     * @return array{status: string, mapping?: string}
     */
    private static function lookupCodePointStatus($codePoint, $useSTD3ASCIIRules)
    {
        if (!self::$mappingTableLoaded) {
            self::$mappingTableLoaded = true;
            self::$mapped = require __DIR__.'/Resources/unidata/mapped.php';
            self::$ignored = require __DIR__.'/Resources/unidata/ignored.php';
            self::$deviation = require __DIR__.'/Resources/unidata/deviation.php';
            self::$disallowed = require __DIR__.'/Resources/unidata/disallowed.php';
            self::$disallowed_STD3_mapped = require __DIR__.'/Resources/unidata/disallowed_STD3_mapped.php';
            self::$disallowed_STD3_valid = require __DIR__.'/Resources/unidata/disallowed_STD3_valid.php';
        }

        if (isset(self::$mapped[$codePoint])) {
            return ['status' => 'mapped', 'mapping' => self::$mapped[$codePoint]];
        }

        if (isset(self::$ignored[$codePoint])) {
            return ['status' => 'ignored'];
        }

        if (isset(self::$deviation[$codePoint])) {
            return ['status' => 'deviation', 'mapping' => self::$deviation[$codePoint]];
        }

        if (isset(self::$disallowed[$codePoint]) || DisallowedRanges::inRange($codePoint)) {
            return ['status' => 'disallowed'];
        }

        $isDisallowedMapped = isset(self::$disallowed_STD3_mapped[$codePoint]);

        if ($isDisallowedMapped || isset(self::$disallowed_STD3_valid[$codePoint])) {
            $status = 'disallowed';

            if (!$useSTD3ASCIIRules) {
                $status = $isDisallowedMapped ? 'mapped' : 'valid';
            }

            if ($isDisallowedMapped) {
                return ['status' => $status, 'mapping' => self::$disallowed_STD3_mapped[$codePoint]];
            }

            return ['status' => $status];
        }

        return ['status' => 'valid'];
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Mon Mar 31 05:10:02 2025 | Cross-referenced by PHPXref 0.7.1 |