friendship ended with social-app. php is my new best friend
at main 24 kB view raw
<?php

/**
 * This file is part of the Nette Framework (https://nette.org)
 * Copyright (c) 2004 David Grudl (https://davidgrudl.com)
 */

declare(strict_types=1);

namespace Nette\Utils;

use JetBrains\PhpStorm\Language;
use Nette;
use function array_keys, array_map, array_shift, array_values, bin2hex, class_exists, defined, extension_loaded, function_exists, htmlspecialchars, htmlspecialchars_decode, iconv, iconv_strlen, iconv_substr, implode, in_array, is_array, is_callable, is_int, is_object, is_string, key, max, mb_convert_case, mb_strlen, mb_strtolower, mb_strtoupper, mb_substr, pack, preg_last_error, preg_last_error_msg, preg_quote, preg_replace, str_contains, str_ends_with, str_repeat, str_replace, str_starts_with, strlen, strpos, strrev, strrpos, strtolower, strtoupper, strtr, substr, trim, unpack, utf8_decode;
use const ENT_IGNORE, ENT_NOQUOTES, ICONV_IMPL, MB_CASE_TITLE, PHP_EOL, PREG_OFFSET_CAPTURE, PREG_PATTERN_ORDER, PREG_SET_ORDER, PREG_SPLIT_DELIM_CAPTURE, PREG_SPLIT_NO_EMPTY, PREG_SPLIT_OFFSET_CAPTURE, PREG_UNMATCHED_AS_NULL;


/**
 * String tools library.
 */
class Strings
{
	use Nette\StaticClass;

	/** Characters removed by trim() when no explicit character list is given. */
	public const TrimCharacters = " \t\n\r\0\x0B\u{A0}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{200B}\u{2028}\u{3000}";

	/** @deprecated use Strings::TrimCharacters */
	public const TRIM_CHARACTERS = self::TrimCharacters;


	/**
	 * Checks that fixEncoding() would leave the string unchanged.
	 * @deprecated use Nette\Utils\Validators::isUnicode()
	 */
	public static function checkEncoding(string $s): bool
	{
		return $s === self::fixEncoding($s);
	}


	/**
	 * Removes all invalid UTF-8 characters from a string.
	 */
	public static function fixEncoding(string $s): string
	{
		// removes xD800-xDFFF, x110000 and higher
		return htmlspecialchars_decode(htmlspecialchars($s, ENT_NOQUOTES | ENT_IGNORE, 'UTF-8'), ENT_NOQUOTES);
	}


	/**
	 * Returns a specific character in UTF-8 from code point (number in range 0x0000..D7FF or 0xE000..10FFFF).
	 * @throws Nette\InvalidArgumentException if code point is not in valid range
	 * @throws Nette\NotSupportedException if the iconv extension is not loaded
	 */
	public static function chr(int $code): string
	{
		if ($code < 0 || ($code >= 0xD800 && $code <= 0xDFFF) || $code > 0x10FFFF) {
			throw new Nette\InvalidArgumentException('Code point must be in range 0x0 to 0xD7FF or 0xE000 to 0x10FFFF.');
		} elseif (!extension_loaded('iconv')) {
			throw new Nette\NotSupportedException(__METHOD__ . '() requires ICONV extension that is not loaded.');
		}

		// the code point packed as a big-endian 32-bit integer is exactly its UTF-32BE encoding
		return iconv('UTF-32BE', 'UTF-8//IGNORE', pack('N', $code));
	}


	/**
	 * Returns a code point of specific character in UTF-8 (number in range 0x0000..D7FF or 0xE000..10FFFF).
	 * @throws Nette\InvalidArgumentException if the conversion yields nothing (empty or invalid input)
	 * @throws Nette\NotSupportedException if the iconv extension is not loaded
	 */
	public static function ord(string $c): int
	{
		if (!extension_loaded('iconv')) {
			throw new Nette\NotSupportedException(__METHOD__ . '() requires ICONV extension that is not loaded.');
		}

		$tmp = iconv('UTF-8', 'UTF-32BE//IGNORE', $c);
		if (!$tmp) {
			throw new Nette\InvalidArgumentException('Invalid UTF-8 character "' . ($c === '' ? '' : '\x' . strtoupper(bin2hex($c))) . '".');
		}

		// only the first 32-bit unit is decoded
		return unpack('N', $tmp)[1];
	}


	/**
	 * @deprecated use str_starts_with()
	 */
	public static function startsWith(string $haystack, string $needle): bool
	{
		return str_starts_with($haystack, $needle);
	}


	/**
	 * @deprecated use str_ends_with()
	 */
	public static function endsWith(string $haystack, string $needle): bool
	{
		return str_ends_with($haystack, $needle);
	}


	/**
	 * @deprecated use str_contains()
	 */
	public static function contains(string $haystack, string $needle): bool
	{
		return str_contains($haystack, $needle);
	}


	/**
	 * Returns a part of UTF-8 string specified by starting position and length. If start is negative,
	 * the returned string will start at the start'th character from the end of string.
	 * @throws Nette\NotSupportedException if neither mbstring nor iconv is available
	 */
	public static function substring(string $s, int $start, ?int $length = null): string
	{
		if (function_exists('mb_substr')) {
			return mb_substr($s, $start, $length, 'UTF-8'); // MB is much faster
		} elseif (!extension_loaded('iconv')) {
			throw new Nette\NotSupportedException(__METHOD__ . '() requires extension ICONV or MBSTRING, neither is loaded.');
		} elseif ($length === null) {
			$length = self::length($s);
		} elseif ($start < 0 && $length < 0) {
			$start += self::length($s); // unifies iconv_substr behavior with mb_substr
		}

		return iconv_substr($s, $start, $length, 'UTF-8');
	}


	/**
	 * Removes control characters, normalizes line breaks to `\n`, removes leading and trailing blank lines,
	 * trims end spaces on lines, normalizes UTF-8 to the normal form of NFC.
	 * @throws RegexpException if a regular expression fails (reported by pcre())
	 */
	public static function normalize(string $s): string
	{
		// convert to compressed normal form (NFC); the input is kept when the Normalizer reports failure
		if (class_exists('Normalizer', false) && ($n = \Normalizer::normalize($s, \Normalizer::FORM_C)) !== false) {
			$s = $n;
		}

		$s = self::unixNewLines($s);

		// remove control characters; leave \t + \n
		$s = self::pcre('preg_replace', ['#[\x00-\x08\x0B-\x1F\x7F-\x9F]+#u', '', $s]);

		// right trim
		$s = self::pcre('preg_replace', ['#[\t ]+$#m', '', $s]);

		// leading and trailing blank lines
		$s = trim($s, "\n");

		return $s;
	}


	/** @deprecated use Strings::unixNewLines() */
	public static function normalizeNewLines(string $s): string
	{
		return self::unixNewLines($s);
	}


	/**
	 * Converts line endings to \n used on Unix-like systems.
	 * Line endings are: \n, \r, \r\n, U+2028 line separator, U+2029 paragraph separator.
	 */
	public static function unixNewLines(string $s): string
	{
		return preg_replace("~\r\n?|\u{2028}|\u{2029}~", "\n", $s);
	}


	/**
	 * Converts line endings to platform-specific, i.e. \r\n on Windows and \n elsewhere.
	 * Line endings are: \n, \r, \r\n, U+2028 line separator, U+2029 paragraph separator.
	 */
	public static function platformNewLines(string $s): string
	{
		return preg_replace("~\r\n?|\n|\u{2028}|\u{2029}~", PHP_EOL, $s);
	}


	/**
	 * Converts UTF-8 string to ASCII, ie removes diacritics etc.
	 * Uses the intl Transliterator when the class is already loaded, otherwise falls back to iconv
	 * (glibc or libiconv), and as a last resort simply drops every non-ASCII byte.
	 * @throws RegexpException if the input is not valid UTF-8 (reported by pcre())
	 */
	public static function toAscii(string $s): string
	{
		$iconv = defined('ICONV_IMPL') ? trim(ICONV_IMPL, '"\'') : null;
		static $transliterator = null; // created once; false means that intl is not available
		if ($transliterator === null) {
			if (class_exists('Transliterator', false)) {
				$transliterator = \Transliterator::create('Any-Latin; Latin-ASCII');
			} else {
				trigger_error(__METHOD__ . "(): it is recommended to enable PHP extensions 'intl'.", E_USER_NOTICE);
				$transliterator = false;
			}
		}

		// remove control characters and check UTF-8 validity
		$s = self::pcre('preg_replace', ['#[^\x09\x0A\x0D\x20-\x7E\xA0-\x{2FF}\x{370}-\x{10FFFF}]#u', '', $s]);

		// transliteration (by Transliterator and iconv) is not optimal, replace some characters directly
		$s = strtr($s, ["\u{201E}" => '"', "\u{201C}" => '"', "\u{201D}" => '"', "\u{201A}" => "'", "\u{2018}" => "'", "\u{2019}" => "'", "\u{B0}" => '^', "\u{42F}" => 'Ya', "\u{44F}" => 'ya', "\u{42E}" => 'Yu', "\u{44E}" => 'yu', "\u{c4}" => 'Ae', "\u{d6}" => 'Oe', "\u{dc}" => 'Ue', "\u{1e9e}" => 'Ss', "\u{e4}" => 'ae', "\u{f6}" => 'oe', "\u{fc}" => 'ue', "\u{df}" => 'ss']); // „ “ ” ‚ ‘ ’ ° Я я Ю ю Ä Ö Ü ẞ ä ö ü ß
		if ($iconv !== 'libiconv') {
			$s = strtr($s, ["\u{AE}" => '(R)', "\u{A9}" => '(c)', "\u{2026}" => '...', "\u{AB}" => '<<', "\u{BB}" => '>>', "\u{A3}" => 'lb', "\u{A5}" => 'yen', "\u{B2}" => '^2', "\u{B3}" => '^3', "\u{B5}" => 'u', "\u{B9}" => '^1', "\u{BA}" => 'o', "\u{BF}" => '?', "\u{2CA}" => "'", "\u{2CD}" => '_', "\u{2DD}" => '"', "\u{1FEF}" => '', "\u{20AC}" => 'EUR', "\u{2122}" => 'TM', "\u{212E}" => 'e', "\u{2190}" => '<-', "\u{2191}" => '^', "\u{2192}" => '->', "\u{2193}" => 'V', "\u{2194}" => '<->']); // ® © … « » £ ¥ ² ³ µ ¹ º ¿ ˊ ˍ ˝ ` € ™ ℮ ← ↑ → ↓ ↔
		}

		if ($transliterator) {
			$s = $transliterator->transliterate($s);
			// use iconv because The transliterator leaves some characters out of ASCII, eg → ʾ
			if ($iconv === 'glibc') {
				$s = strtr($s, '?', "\x01"); // temporarily hide ? to distinguish them from the garbage that iconv creates
				$s = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
				$s = str_replace(['?', "\x01"], ['', '?'], $s); // remove garbage and restore ? characters
			} elseif ($iconv === 'libiconv') {
				$s = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
			} else { // null or 'unknown' (#216)
				$s = self::pcre('preg_replace', ['#[^\x00-\x7F]++#', '', $s]); // remove non-ascii chars
			}
		} elseif ($iconv === 'glibc' || $iconv === 'libiconv') {
			// temporarily hide these characters to distinguish them from the garbage that iconv creates
			$s = strtr($s, '`\'"^~?', "\x01\x02\x03\x04\x05\x06");
			if ($iconv === 'glibc') {
				// glibc implementation is very limited. transliterate into Windows-1250 and then into ASCII, so most Eastern European characters are preserved
				$s = iconv('UTF-8', 'WINDOWS-1250//TRANSLIT//IGNORE', $s);
				$s = strtr(
					$s,
					"\xa5\xa3\xbc\x8c\xa7\x8a\xaa\x8d\x8f\x8e\xaf\xb9\xb3\xbe\x9c\x9a\xba\x9d\x9f\x9e\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf8\xf9\xfa\xfb\xfc\xfd\xfe\x96\xa0\x8b\x97\x9b\xa6\xad\xb7",
					'ALLSSSSTZZZallssstzzzRAAAALCCCEEEEIIDDNNOOOOxRUUUUYTsraaaalccceeeeiiddnnooooruuuuyt- <->|-.',
				);
				$s = self::pcre('preg_replace', ['#[^\x00-\x7F]++#', '', $s]);
			} else {
				$s = iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
			}

			// remove garbage that iconv creates during transliteration (eg Ý -> Y')
			$s = str_replace(['`', "'", '"', '^', '~', '?'], '', $s);
			// restore temporarily hidden characters
			$s = strtr($s, "\x01\x02\x03\x04\x05\x06", '`\'"^~?');
		} else {
			$s = self::pcre('preg_replace', ['#[^\x00-\x7F]++#', '', $s]); // remove non-ascii chars
		}

		return $s;
	}


	/**
	 * Modifies the UTF-8 string to the form used in the URL, ie removes diacritics and replaces all characters
	 * except letters of the English alphabet and numbers with a hyphens.
	 * @param ?string $charlist  additional characters that are kept as they are
	 * @param bool $lower  whether to convert the result to lower case
	 */
	public static function webalize(string $s, ?string $charlist = null, bool $lower = true): string
	{
		$s = self::toAscii($s);
		if ($lower) {
			$s = strtolower($s);
		}

		$s = self::pcre('preg_replace', ['#[^a-z0-9' . ($charlist !== null ? preg_quote($charlist, '#') : '') . ']+#i', '-', $s]);
		$s = trim($s, '-');
		return $s;
	}


	/**
	 * Truncates a UTF-8 string to given maximal length, while trying not to split whole words. Only if the string is truncated,
	 * an ellipsis (or something else set with third argument) is appended to the string.
	 */
	public static function truncate(string $s, int $maxLen, string $append = "\u{2026}"): string
	{
		if (self::length($s) > $maxLen) {
			$maxLen -= self::length($append); // the appended text counts towards the limit
			if ($maxLen < 1) {
				return $append;

			} elseif ($matches = self::match($s, '#^.{1,' . $maxLen . '}(?=[\s\x00-/:-@\[-`{-~])#us')) {
				// longest prefix that is followed by whitespace or ASCII punctuation
				return $matches[0] . $append;

			} else {
				return self::substring($s, 0, $maxLen) . $append;
			}
		}

		return $s;
	}


	/**
	 * Indents a multiline text from the left. Second argument sets how many indentation chars should be used,
	 * while the indent itself is the third argument (*tab* by default).
	 */
	public static function indent(string $s, int $level = 1, string $chars = "\t"): string
	{
		if ($level > 0) {
			// insert the indentation at the start of every line that has some content
			$s = self::replace($s, '#(?:^|[\r\n]+)(?=[^\r\n])#', '$0' . str_repeat($chars, $level));
		}

		return $s;
	}


	/**
	 * Converts all characters of UTF-8 string to lower case.
	 */
	public static function lower(string $s): string
	{
		return mb_strtolower($s, 'UTF-8');
	}


	/**
	 * Converts the first character of a UTF-8 string to lower case and leaves the other characters unchanged.
	 */
	public static function firstLower(string $s): string
	{
		return self::lower(self::substring($s, 0, 1)) . self::substring($s, 1);
	}


	/**
	 * Converts all characters of a UTF-8 string to upper case.
	 */
	public static function upper(string $s): string
	{
		return mb_strtoupper($s, 'UTF-8');
	}


	/**
	 * Converts the first character of a UTF-8 string to upper case and leaves the other characters unchanged.
	 */
	public static function firstUpper(string $s): string
	{
		return self::upper(self::substring($s, 0, 1)) . self::substring($s, 1);
	}


	/**
	 * Converts the first character of every word of a UTF-8 string to upper case and the others to lower case.
	 */
	public static function capitalize(string $s): string
	{
		return mb_convert_case($s, MB_CASE_TITLE, 'UTF-8');
	}


	/**
	 * Compares two UTF-8 strings or their parts, without taking character case into account. If length is null, whole strings are compared,
	 * if it is negative, the corresponding number of characters from the end of the strings is compared,
	 * otherwise the appropriate number of characters from the beginning is compared.
	 */
	public static function compare(string $left, string $right, ?int $length = null): bool
	{
		if (class_exists('Normalizer', false)) {
			// form NFD is faster; normalize() returns false for malformed UTF-8, in that case
			// the operand is kept as it is so that no false reaches the string-typed helpers
			if (($normalized = \Normalizer::normalize($left, \Normalizer::FORM_D)) !== false) {
				$left = $normalized;
			}

			if (($normalized = \Normalizer::normalize($right, \Normalizer::FORM_D)) !== false) {
				$right = $normalized;
			}
		}

		if ($length < 0) {
			$left = self::substring($left, $length, -$length);
			$right = self::substring($right, $length, -$length);
		} elseif ($length !== null) {
			$left = self::substring($left, 0, $length);
			$right = self::substring($right, 0, $length);
		}

		return self::lower($left) === self::lower($right);
	}


	/**
	 * Finds the common prefix of strings or returns empty string if the prefix was not found.
	 * An empty list has no common prefix, so an empty string is returned for it as well.
	 * @param  string[]  $strings
	 */
	public static function findPrefix(array $strings): string
	{
		if ($strings === []) {
			return '';
		}

		$first = array_shift($strings);
		for ($i = 0; $i < strlen($first); $i++) {
			foreach ($strings as $s) {
				if (!isset($s[$i]) || $first[$i] !== $s[$i]) {
					// the strings differ in the middle of a multi-byte character: step back to its first byte
					while ($i && $first[$i - 1] >= "\x80" && $first[$i] >= "\x80" && $first[$i] < "\xC0") {
						$i--;
					}

					return substr($first, 0, $i);
				}
			}
		}

		return $first;
	}


	/**
	 * Returns number of characters (not bytes) in UTF-8 string.
	 * That is the number of Unicode code points which may differ from the number of graphemes.
	 */
	public static function length(string $s): int
	{
		return match (true) {
			extension_loaded('mbstring') => mb_strlen($s, 'UTF-8'),
			extension_loaded('iconv') => iconv_strlen($s, 'UTF-8'),
			default => strlen(@utf8_decode($s)), // deprecated
		};
	}


	/**
	 * Removes all left and right side spaces (or the characters passed as second argument) from a UTF-8 encoded string.
	 * @throws RegexpException if the string is not valid UTF-8 (reported by pcre())
	 */
	public static function trim(string $s, string $charlist = self::TrimCharacters): string
	{
		$charlist = preg_quote($charlist, '#');
		return self::replace($s, '#^[' . $charlist . ']+|[' . $charlist . ']+$#Du', '');
	}


	/**
	 * Pads a UTF-8 string to given length by prepending the $pad string to the beginning.
	 * @param  non-empty-string  $pad
	 */
	public static function padLeft(string $s, int $length, string $pad = ' '): string
	{
		$length = max(0, $length - self::length($s)); // number of characters still missing
		$padLen = self::length($pad);
		return str_repeat($pad, (int) ($length / $padLen)) . self::substring($pad, 0, $length % $padLen) . $s;
	}


	/**
	 * Pads UTF-8 string to given length by appending the $pad string to the end.
	 * @param  non-empty-string  $pad
	 */
	public static function padRight(string $s, int $length, string $pad = ' '): string
	{
		$length = max(0, $length - self::length($s)); // number of characters still missing
		$padLen = self::length($pad);
		return $s . str_repeat($pad, (int) ($length / $padLen)) . self::substring($pad, 0, $length % $padLen);
	}


	/**
	 * Reverses UTF-8 string.
	 * @throws Nette\NotSupportedException if the iconv extension is not loaded
	 */
	public static function reverse(string $s): string
	{
		if (!extension_loaded('iconv')) {
			throw new Nette\NotSupportedException(__METHOD__ . '() requires ICONV extension that is not loaded.');
		}

		// byte-reversed UTF-32BE is UTF-32LE with the characters in the opposite order
		return iconv('UTF-32LE', 'UTF-8', strrev(iconv('UTF-8', 'UTF-32BE', $s)));
	}


	/**
	 * Returns part of $haystack before $nth occurrence of $needle or returns null if the needle was not found.
	 * Negative value means searching from the end.
	 */
	public static function before(string $haystack, string $needle, int $nth = 1): ?string
	{
		$pos = self::pos($haystack, $needle, $nth);
		return $pos === null
			? null
			: substr($haystack, 0, $pos);
	}


	/**
	 * Returns part of $haystack after $nth occurrence of $needle or returns null if the needle was not found.
	 * Negative value means searching from the end.
	 */
	public static function after(string $haystack, string $needle, int $nth = 1): ?string
	{
		$pos = self::pos($haystack, $needle, $nth);
		return $pos === null
			? null
			: substr($haystack, $pos + strlen($needle));
	}


	/**
	 * Returns position in characters of $nth occurrence of $needle in $haystack or null if the $needle was not found.
	 * Negative value of `$nth` means searching from the end.
	 */
	public static function indexOf(string $haystack, string $needle, int $nth = 1): ?int
	{
		$pos = self::pos($haystack, $needle, $nth);
		return $pos === null
			? null
			: self::length(substr($haystack, 0, $pos)); // converts the byte offset to a character offset
	}


	/**
	 * Returns position in bytes of $nth occurrence of $needle in $haystack or null if the needle was not found.
	 * Negative value of `$nth` means searching from the end; zero never matches.
	 */
	private static function pos(string $haystack, string $needle, int $nth = 1): ?int
	{
		if (!$nth) {
			return null;
		} elseif ($nth > 0) {
			if ($needle === '') {
				return 0;
			}

			$pos = 0;
			while (($pos = strpos($haystack, $needle, $pos)) !== false && --$nth) {
				$pos++;
			}
		} else {
			$len = strlen($haystack);
			if ($needle === '') {
				return $len;
			} elseif ($len === 0) {
				return null;
			}

			$pos = $len - 1;
			while (($pos = strrpos($haystack, $needle, $pos - $len)) !== false && ++$nth) {
				if ($pos === 0) {
					// nothing precedes the start of the string; continuing would hand strrpos()
					// an offset outside of the haystack, which throws ValueError
					return null;
				}

				$pos--;
			}
		}

		return Helpers::falseToNull($pos);
	}


	/**
	 * Divides the string into arrays according to the regular expression. Expressions in parentheses will be captured and returned as well.
	 * @param bool|int $captureOffset  true to return offsets as well; an integer is used directly as preg_split() flags (back compatibility)
	 * @param bool $utf8  adds the `u` modifier and reports captured offsets in characters instead of bytes
	 * @throws RegexpException on a compilation or runtime error of the expression
	 */
	public static function split(
		string $subject,
		#[Language('RegExp')]
		string $pattern,
		bool|int $captureOffset = false,
		bool $skipEmpty = false,
		int $limit = -1,
		bool $utf8 = false,
	): array
	{
		$flags = is_int($captureOffset) // back compatibility
			? $captureOffset
			: ($captureOffset ? PREG_SPLIT_OFFSET_CAPTURE : 0) | ($skipEmpty ? PREG_SPLIT_NO_EMPTY : 0);

		$pattern .= $utf8 ? 'u' : '';
		$m = self::pcre('preg_split', [$pattern, $subject, $limit, $flags | PREG_SPLIT_DELIM_CAPTURE]);
		return $utf8 && $captureOffset
			? self::bytesToChars($subject, [$m])[0]
			: $m;
	}


	/**
	 * Searches the string for the part matching the regular expression and returns
	 * an array with the found expression and individual subexpressions, or `null`.
	 * @param bool|int $captureOffset  true to return offsets as well; an integer is used directly as preg_match() flags (back compatibility)
	 * @param bool $utf8  adds the `u` modifier and treats $offset and captured offsets as characters instead of bytes
	 * @throws RegexpException on a compilation or runtime error of the expression
	 */
	public static function match(
		string $subject,
		#[Language('RegExp')]
		string $pattern,
		bool|int $captureOffset = false,
		int $offset = 0,
		bool $unmatchedAsNull = false,
		bool $utf8 = false,
	): ?array
	{
		$flags = is_int($captureOffset) // back compatibility
			? $captureOffset
			: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);

		if ($utf8) {
			$offset = strlen(self::substring($subject, 0, $offset)); // character offset -> byte offset
			$pattern .= 'u';
		}

		if ($offset > strlen($subject)) {
			return null;
		} elseif (!self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])) {
			return null;
		} elseif ($utf8 && $captureOffset) {
			return self::bytesToChars($subject, [$m])[0];
		} else {
			return $m;
		}
	}


	/**
	 * Searches the string for all occurrences matching the regular expression and
	 * returns an array of arrays containing the found expression and each subexpression.
	 * With `$lazy` the matches are produced one at a time by a generator; `$patternOrder` is not used in that mode.
	 * @return ($lazy is true ? \Generator<int, array> : array[])
	 * @throws RegexpException on a compilation or runtime error of the expression
	 */
	public static function matchAll(
		string $subject,
		#[Language('RegExp')]
		string $pattern,
		bool|int $captureOffset = false,
		int $offset = 0,
		bool $unmatchedAsNull = false,
		bool $patternOrder = false,
		bool $utf8 = false,
		bool $lazy = false,
	): array|\Generator
	{
		if ($utf8) {
			$offset = strlen(self::substring($subject, 0, $offset)); // character offset -> byte offset
			$pattern .= 'u';
		}

		if ($lazy) {
			// offsets are always captured here because they are needed to advance the search
			$flags = PREG_OFFSET_CAPTURE | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
			return (function () use ($utf8, $captureOffset, $flags, $subject, $pattern, $offset) {
				$counter = 0;
				while (
					$offset <= strlen($subject) - ($counter ? 1 : 0)
					&& self::pcre('preg_match', [$pattern, $subject, &$m, $flags, $offset])
				) {
					// move past the match; an empty match advances by one byte
					$offset = $m[0][1] + max(1, strlen($m[0][0]));
					if (!$captureOffset) {
						$m = array_map(fn($item) => $item[0], $m);
					} elseif ($utf8) {
						$m = self::bytesToChars($subject, [$m])[0];
					}
					yield $counter++ => $m;
				}
			})();
		}

		if ($offset > strlen($subject)) {
			return [];
		}

		$flags = is_int($captureOffset) // back compatibility
			? $captureOffset
			: ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0) | ($patternOrder ? PREG_PATTERN_ORDER : 0);

		self::pcre('preg_match_all', [
			$pattern, $subject, &$m,
			($flags & PREG_PATTERN_ORDER) ? $flags : ($flags | PREG_SET_ORDER),
			$offset,
		]);
		return $utf8 && $captureOffset
			? self::bytesToChars($subject, $m)
			: $m;
	}


	/**
	 * Replaces all occurrences matching regular expression $pattern which can be string or array in the form `pattern => replacement`.
	 * The replacement may also be a callback (closure or array callable) that receives the match.
	 * @param bool $captureOffset  passes offsets to the callback (callback replacement only)
	 * @param bool $unmatchedAsNull  passes unmatched groups to the callback as null (callback replacement only)
	 * @param bool $utf8  adds the `u` modifier to every pattern; offsets passed to the callback are in characters
	 * @throws Nette\InvalidStateException if an object or array replacement is not callable
	 * @throws RegexpException on a compilation or runtime error of the expression
	 */
	public static function replace(
		string $subject,
		#[Language('RegExp')]
		string|array $pattern,
		string|callable $replacement = '',
		int $limit = -1,
		bool $captureOffset = false,
		bool $unmatchedAsNull = false,
		bool $utf8 = false,
	): string
	{
		if (is_object($replacement) || is_array($replacement)) {
			if (!is_callable($replacement, false, $textual)) {
				throw new Nette\InvalidStateException("Callback '$textual' is not callable.");
			}

			$flags = ($captureOffset ? PREG_OFFSET_CAPTURE : 0) | ($unmatchedAsNull ? PREG_UNMATCHED_AS_NULL : 0);
			if ($utf8) {
				// the modifier goes to every pattern; concatenating to an array would turn it into a string
				$pattern = is_array($pattern)
					? array_map(fn($item) => $item . 'u', $pattern)
					: $pattern . 'u';
				if ($captureOffset) {
					$replacement = fn($m) => $replacement(self::bytesToChars($subject, [$m])[0]);
				}
			}

			return self::pcre('preg_replace_callback', [$pattern, $replacement, $subject, $limit, 0, $flags]);

		} elseif (is_array($pattern) && is_string(key($pattern))) {
			// form `pattern => replacement`
			$replacement = array_values($pattern);
			$pattern = array_keys($pattern);
		}

		if ($utf8) {
			$pattern = array_map(fn($item) => $item . 'u', (array) $pattern);
		}

		return self::pcre('preg_replace', [$pattern, $replacement, $subject, $limit]);
	}


	/**
	 * Converts the byte offsets stored at index 1 of every captured item to character offsets.
	 * The previous position is remembered, so only the distance between consecutive offsets is measured.
	 * @param  array[]  $groups  list of match sets captured with offsets
	 */
	private static function bytesToChars(string $s, array $groups): array
	{
		$lastBytes = $lastChars = 0;
		foreach ($groups as &$matches) {
			foreach ($matches as &$match) {
				if ($match[1] > $lastBytes) {
					$lastChars += self::length(substr($s, $lastBytes, $match[1] - $lastBytes));
				} elseif ($match[1] < $lastBytes) {
					$lastChars -= self::length(substr($s, $match[1], $lastBytes - $match[1]));
				}

				$lastBytes = $match[1];
				$match[1] = $lastChars;
			}
		}

		return $groups;
	}


	/**
	 * Calls a preg_* function and converts its failures to RegexpException.
	 * @param string $func  name of the preg_* function
	 * @param array $args  its arguments; the first one is the pattern or a list of patterns
	 * @throws RegexpException
	 * @internal
	 */
	public static function pcre(string $func, array $args)
	{
		$res = Callback::invokeSafe($func, $args, function (string $message) use ($args): void {
			// compile-time error, not detectable by preg_last_error
			throw new RegexpException($message . ' in pattern: ' . implode(' or ', (array) $args[0]));
		});

		if (($code = preg_last_error()) // run-time error, but preg_last_error & return code are liars
			&& ($res === null || !in_array($func, ['preg_filter', 'preg_replace_callback', 'preg_replace'], true))
		) {
			throw new RegexpException(preg_last_error_msg()
				. ' (pattern: ' . implode(' or ', (array) $args[0]) . ')', $code);
		}

		return $res;
	}
}