friendship ended with social-app. php is my new best friend
<?php

/*
 * This file is part of the Symfony package.
 *
 * (c) Fabien Potencier <fabien@symfony.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace Symfony\Component\DomCrawler;

use Masterminds\HTML5;
use Symfony\Component\CssSelector\CssSelectorConverter;

/**
 * Crawler eases navigation of a list of \DOMNode objects.
 *
 * @author Fabien Potencier <fabien@symfony.com>
 *
 * @implements \IteratorAggregate<int, \DOMNode>
 */
class Crawler implements \Countable, \IteratorAggregate
{
    /**
     * Prefix that stands for the document's default namespace in XPath and CSS expressions.
     */
    private string $defaultNamespacePrefix = 'default';

    /**
     * Namespaces registered by hand, keyed by prefix.
     *
     * @var array<string, string>
     */
    private array $namespaces = [];

    /**
     * Namespaces already looked up in the document, keyed by prefix.
     */
    private \ArrayObject $cachedNamespaces;

    private ?string $baseHref;
    private ?\DOMDocument $document = null;

    /**
     * @var list<\DOMNode>
     */
    private array $nodes = [];

    /**
     * Tells whether the content is HTML (true) or XML (false); used when converting CSS to XPath.
     */
    private bool $isHtml = true;

    private ?HTML5 $html5Parser = null;

    /**
     * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node A Node to use as the base for the crawling
     */
    public function __construct(
        \DOMNodeList|\DOMNode|array|string|null $node = null,
        protected ?string $uri = null,
        ?string $baseHref = null,
        bool $useHtml5Parser = true,
    ) {
        $this->cachedNamespaces = new \ArrayObject();

        // Without an explicit (non-empty) base href, the URI is used as the base
        $this->baseHref = $baseHref ?: $uri;

        if ($useHtml5Parser) {
            $this->html5Parser = new HTML5(['disable_html_ns' => true]);
        } else {
            $this->html5Parser = null;
        }

        $this->add($node);
    }

    /**
     * Returns the current URI.
     */
    public function getUri(): ?string
    {
        return $this->uri;
    }

    /**
     * Returns base href.
*/
    public function getBaseHref(): ?string
    {
        return $this->baseHref;
    }

    /**
     * Removes all the nodes.
     *
     * The owning document and the cache of discovered namespaces are reset too.
     */
    public function clear(): void
    {
        $this->nodes = [];
        $this->document = null;
        $this->cachedNamespaces = new \ArrayObject();
    }

    /**
     * Adds a node to the current list of nodes.
     *
     * This method uses the appropriate specialized add*() method based
     * on the type of the argument. Passing null is a no-op.
     *
     * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node A node
     *
     * @throws \InvalidArgumentException when node is not the expected type
     */
    public function add(\DOMNodeList|\DOMNode|array|string|null $node): void
    {
        if ($node instanceof \DOMNodeList) {
            $this->addNodeList($node);
        } elseif ($node instanceof \DOMNode) {
            $this->addNode($node);
        } elseif (\is_array($node)) {
            $this->addNodes($node);
        } elseif (\is_string($node)) {
            $this->addContent($node);
        } elseif (null !== $node) {
            throw new \InvalidArgumentException(\sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', get_debug_type($node)));
        }
    }

    /**
     * Adds HTML/XML content.
     *
     * If the charset is not set via the content type, it is assumed to be UTF-8,
     * or ISO-8859-1 as a fallback, which is the default charset defined by the
     * HTTP 1.1 specification.
     *
     * When no type is given, content starting with "<?xml" is handled as
     * "application/xml" and anything else as "text/html". Content whose type
     * is neither XML nor HTML is ignored.
     *
     * @param string      $content The HTML or XML markup to parse
     * @param string|null $type    The content type of the markup, matched case-insensitively
     */
    public function addContent(string $content, ?string $type = null): void
    {
        if (!$type) {
            $type = str_starts_with($content, '<?xml') ? 'application/xml' : 'text/html';
        }

        // DOM only for HTML/XML content
        if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
            return;
        }

        $charset = preg_match('//u', $content) ? 'UTF-8' : 'ISO-8859-1';

        // http://www.w3.org/TR/encoding/#encodings
        // http://www.w3.org/TR/REC-xml/#NT-EncName
        $content = preg_replace_callback('/(charset *= *["\']?)([a-zA-Z\-0-9_:.]+)/i', function ($m) use (&$charset) {
            // Only trust the declared charset when the conversion routine can handle it;
            // otherwise the declaration is rewritten to the charset detected above
            if ('charset=' === $this->convertToHtmlEntities('charset=', $m[2])) {
                $charset = $m[2];
            }

            return $m[1].$charset;
        }, $content, 1);

        // The pattern above is case-insensitive, so the captured group may be "X" or "HT":
        // normalize it before comparing, otherwise "application/XML" would be parsed as HTML
        if ('x' === strtolower($xmlMatches[1])) {
            $this->addXmlContent($content, $charset);
        } else {
            $this->addHtmlContent($content, $charset);
        }
    }

    /**
     * Adds an HTML content to the list of nodes.
     *
     * The libxml errors are disabled when the content is parsed.
     *
     * If you want to get parsing errors, be sure to enable
     * internal errors via libxml_use_internal_errors(true)
     * and then, get the errors via libxml_get_errors(). Be
     * sure to clear errors with libxml_clear_errors() afterward.
     *
     * When the parsed content holds a <base> element with a non-empty href,
     * the base href of the crawler is updated: it is resolved against the
     * current base href when there is one, or used as is otherwise.
     */
    public function addHtmlContent(string $content, string $charset = 'UTF-8'): void
    {
        $dom = $this->parseHtmlString($content, $charset);
        $this->addDocument($dom);

        $base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);

        $baseHref = current($base);
        if (\count($base) && $baseHref) {
            if ($this->baseHref) {
                // Let Link resolve the <base> href against the current base href
                $linkNode = $dom->createElement('a');
                $linkNode->setAttribute('href', $baseHref);
                $link = new Link($linkNode, $this->baseHref);
                $this->baseHref = $link->getUri();
            } else {
                $this->baseHref = $baseHref;
            }
        }
    }

    /**
     * Adds an XML content to the list of nodes.
     *
     * The libxml errors are disabled when the content is parsed.
     *
     * If you want to get parsing errors, be sure to enable
     * internal errors via libxml_use_internal_errors(true)
     * and then, get the errors via libxml_get_errors(). Be
     * sure to clear errors with libxml_clear_errors() afterward.
*
     * @param int $options Bitwise OR of the libxml option constants
     *                     LIBXML_PARSEHUGE is dangerous, see
     *                     http://symfony.com/blog/security-release-symfony-2-0-17-released
     */
    public function addXmlContent(string $content, string $charset = 'UTF-8', int $options = \LIBXML_NONET): void
    {
        // When no prefixed namespace is declared, rename "xmlns" so that the default
        // namespace disappears and XPath expressions stay simple
        $hasPrefixedNamespace = str_contains($content, 'xmlns:');
        if (!$hasPrefixedNamespace) {
            $content = str_replace('xmlns', 'ns', $content);
        }

        // Collect libxml errors internally while parsing, then restore the previous mode
        $previousErrorMode = libxml_use_internal_errors(true);

        $xmlDocument = new \DOMDocument('1.0', $charset);
        $xmlDocument->validateOnParse = true;

        if ('' !== trim($content)) {
            @$xmlDocument->loadXML($content, $options);
        }

        libxml_use_internal_errors($previousErrorMode);

        $this->addDocument($xmlDocument);

        $this->isHtml = false;
    }

    /**
     * Adds the root element of a \DOMDocument to the list of nodes.
     *
     * A document without a root element is ignored.
     *
     * @param \DOMDocument $dom A \DOMDocument instance
     */
    public function addDocument(\DOMDocument $dom): void
    {
        if (!$dom->documentElement) {
            return;
        }

        $this->addNode($dom->documentElement);
    }

    /**
     * Adds every node of a \DOMNodeList to the list of nodes.
     *
     * @param \DOMNodeList $nodes A \DOMNodeList instance
     */
    public function addNodeList(\DOMNodeList $nodes): void
    {
        foreach ($nodes as $item) {
            if (!$item instanceof \DOMNode) {
                continue;
            }

            $this->addNode($item);
        }
    }

    /**
     * Adds each entry of an array to the list of nodes, going through add().
     *
     * @param \DOMNode[] $nodes An array of \DOMNode instances
     */
    public function addNodes(array $nodes): void
    {
        foreach ($nodes as $entry) {
            $this->add($entry);
        }
    }

    /**
     * Adds a \DOMNode instance to the list of nodes.
*
     * A \DOMDocument is replaced by its root element; an empty document
     * (one without a root element) is ignored. A node that is already in
     * the list is not added twice.
     *
     * @param \DOMNode $node A \DOMNode instance
     *
     * @throws \InvalidArgumentException When the node belongs to another document than the nodes already in the crawler
     */
    public function addNode(\DOMNode $node): void
    {
        if ($node instanceof \DOMDocument) {
            $node = $node->documentElement;

            // An empty document has no root element: bail out instead of
            // dereferencing null and storing it in the node list
            if (null === $node) {
                return;
            }
        }

        if (null !== $this->document && $this->document !== $node->ownerDocument) {
            throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.');
        }

        // The first node added decides which document this crawler is bound to
        $this->document ??= $node->ownerDocument;

        // Don't add duplicate nodes in the Crawler
        if (\in_array($node, $this->nodes, true)) {
            return;
        }

        $this->nodes[] = $node;
    }

    /**
     * Returns a node given its position in the node list.
     *
     * @param int $position Zero-based index in the node list
     *
     * @return static A crawler wrapping the node, or an empty crawler when there is no node at this position
     */
    public function eq(int $position): static
    {
        return $this->createSubCrawler($this->nodes[$position] ?? null);
    }

    /**
     * Calls an anonymous function on each node of the list.
     *
     * The anonymous function receives the node wrapped in a Crawler instance
     * as first argument and its position as second argument.
     *
     * Example:
     *
     *     $crawler->filter('h1')->each(function ($node, $i) {
     *         return $node->text();
     *     });
     *
     * @param \Closure $closure An anonymous function
     *
     * @return array An array of values returned by the anonymous function
     */
    public function each(\Closure $closure): array
    {
        $data = [];
        foreach ($this->nodes as $i => $node) {
            $data[] = $closure($this->createSubCrawler($node), $i);
        }

        return $data;
    }

    /**
     * Slices the list of nodes by $offset and $length.
     *
     * Both arguments are handed over to array_slice() unchanged.
     */
    public function slice(int $offset = 0, ?int $length = null): static
    {
        return $this->createSubCrawler(\array_slice($this->nodes, $offset, $length));
    }

    /**
     * Reduces the list of nodes by calling an anonymous function.
     *
     * To remove a node from the list, the anonymous function must return false.
*
     * The function is given the node wrapped in a Crawler and its position.
     *
     * @param \Closure $closure An anonymous function
     */
    public function reduce(\Closure $closure): static
    {
        $kept = [];
        foreach ($this->nodes as $index => $domNode) {
            // Only a strict false drops the node; any other return value keeps it
            if (false === $closure($this->createSubCrawler($domNode), $index)) {
                continue;
            }

            $kept[] = $domNode;
        }

        return $this->createSubCrawler($kept);
    }

    /**
     * Returns a crawler for the first node of the current selection.
     */
    public function first(): static
    {
        return $this->eq(0);
    }

    /**
     * Returns a crawler for the last node of the current selection.
     */
    public function last(): static
    {
        $lastPosition = \count($this->nodes) - 1;

        return $this->eq($lastPosition);
    }

    /**
     * Returns the sibling elements of the first node of the current selection.
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function siblings(): static
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        // Walk the parent's children from the first one; sibling() leaves out the current node
        $firstChildOfParent = $this->getNode(0)->parentNode->firstChild;

        return $this->createSubCrawler($this->sibling($firstChildOfParent));
    }

    /**
     * Tells whether at least one node of the selection matches the CSS selector.
     *
     * An empty selection never matches.
     */
    public function matches(string $selector): bool
    {
        if (!$this->nodes) {
            return false;
        }

        $xpath = $this->createCssSelectorConverter()->toXPath($selector, 'self::');
        $matching = $this->filterRelativeXPath($xpath);

        return 0 !== $matching->count();
    }

    /**
     * Return first parents (heading toward the document root) of the Element that matches the provided selector.
*
     * The search starts with the first node of the selection itself and
     * yields null when no element matches.
     *
     * @see https://developer.mozilla.org/en-US/docs/Web/API/Element/closest#Polyfill
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function closest(string $selector): ?self
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $current = $this->getNode(0);

        // Climb the tree for as long as we stay on element nodes
        while (null !== $current && \XML_ELEMENT_NODE === $current->nodeType) {
            $candidate = $this->createSubCrawler($current);
            if ($candidate->matches($selector)) {
                return $candidate;
            }

            $current = $candidate->getNode(0)->parentNode;
        }

        return null;
    }

    /**
     * Returns the sibling elements that follow the first node of the current selection.
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function nextAll(): static
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $followingElements = $this->sibling($this->getNode(0));

        return $this->createSubCrawler($followingElements);
    }

    /**
     * Returns the sibling elements that precede the first node of the current selection.
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function previousAll(): static
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $precedingElements = $this->sibling($this->getNode(0), 'previousSibling');

        return $this->createSubCrawler($precedingElements);
    }

    /**
     * Returns the ancestors of the current selection.
*
     * Only element nodes are collected, from the closest parent outward,
     * starting at the first node of the selection.
     *
     * @throws \InvalidArgumentException When the current node is empty
     */
    public function ancestors(): static
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $elements = [];

        for ($parent = $this->getNode(0)->parentNode; $parent; $parent = $parent->parentNode) {
            if (\XML_ELEMENT_NODE === $parent->nodeType) {
                $elements[] = $parent;
            }
        }

        return $this->createSubCrawler($elements);
    }

    /**
     * Returns the children nodes of the current selection.
     *
     * With a selector, the children of every node of the selection are filtered
     * with it; without one, the child elements of the first node are returned.
     *
     * @throws \InvalidArgumentException When current node is empty
     * @throws \RuntimeException         If the CssSelector Component is not available and $selector is provided
     */
    public function children(?string $selector = null): static
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        if (null === $selector) {
            $firstChild = $this->getNode(0)->firstChild;

            return $this->createSubCrawler($firstChild ? $this->sibling($firstChild) : []);
        }

        $xpath = $this->createCssSelectorConverter()->toXPath($selector, 'child::');

        return $this->filterRelativeXPath($xpath);
    }

    /**
     * Returns the attribute value of the first node of the list.
     *
     * @param string|null $default When not null: the value to return when the node or attribute is empty
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function attr(string $attribute, ?string $default = null): ?string
    {
        if (!$this->nodes) {
            if (null === $default) {
                throw new \InvalidArgumentException('The current node list is empty.');
            }

            return $default;
        }

        $firstNode = $this->getNode(0);
        if (!$firstNode->hasAttribute($attribute)) {
            return $default;
        }

        return $firstNode->getAttribute($attribute);
    }

    /**
     * Returns the node name of the first node of the list.
*
     * @throws \InvalidArgumentException When current node is empty
     */
    public function nodeName(): string
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        return $this->getNode(0)->nodeName;
    }

    /**
     * Returns the text of the first node of the list.
     *
     * Pass true as the second argument to normalize whitespaces.
     *
     * @param string|null $default             When not null: the value to return when the current node is empty
     * @param bool        $normalizeWhitespace Whether whitespaces should be trimmed and normalized to single spaces
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function text(?string $default = null, bool $normalizeWhitespace = true): string
    {
        if (!$this->nodes) {
            if (null !== $default) {
                return $default;
            }

            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $text = $this->getNode(0)->nodeValue;

        if ($normalizeWhitespace) {
            return $this->normalizeWhitespace($text);
        }

        return $text;
    }

    /**
     * Returns only the inner text that is the direct descendent of the current node, excluding any child nodes.
     *
     * Only text and CDATA children of the first node are considered. Without
     * normalization, the value of the first such child is returned as is; with
     * normalization, the first such child that is not blank is returned
     * normalized. An empty string is returned when nothing qualifies, which
     * includes the case of an empty node list.
     *
     * @param bool $normalizeWhitespace Whether whitespaces should be trimmed and normalized to single spaces
     */
    public function innerText(bool $normalizeWhitespace = true): string
    {
        // getNode() yields null on an empty crawler: iterate over nothing in that case
        // rather than reading a property of null and looping over it
        foreach ($this->getNode(0)?->childNodes ?? [] as $childNode) {
            if (\XML_TEXT_NODE !== $childNode->nodeType && \XML_CDATA_SECTION_NODE !== $childNode->nodeType) {
                continue;
            }
            if (!$normalizeWhitespace) {
                return $childNode->nodeValue;
            }
            // Skip blank text nodes (typically indentation between child elements)
            if ('' !== trim($childNode->nodeValue)) {
                return $this->normalizeWhitespace($childNode->nodeValue);
            }
        }

        return '';
    }

    /**
     * Returns the first node of the list as HTML.
*
     * The markup returned is the serialization of the children of the node,
     * without the node itself.
     *
     * @param string|null $default When not null: the value to return when the current node is empty
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function html(?string $default = null): string
    {
        if (!$this->nodes) {
            if (null === $default) {
                throw new \InvalidArgumentException('The current node list is empty.');
            }

            return $default;
        }

        $firstNode = $this->getNode(0);
        $serializer = $firstNode->ownerDocument;

        // Documents opening with the HTML5 doctype are serialized by the HTML5 parser when there is one
        if ($this->html5Parser && '<!DOCTYPE html>' === $serializer->saveXML($serializer->childNodes[0])) {
            $serializer = $this->html5Parser;
        }

        $markup = '';
        foreach ($firstNode->childNodes as $childNode) {
            $markup .= $serializer->saveHTML($childNode);
        }

        return $markup;
    }

    /**
     * Returns the first node of the list as HTML, the node itself included.
     *
     * @throws \InvalidArgumentException When current node is empty
     */
    public function outerHtml(): string
    {
        if (!\count($this)) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $firstNode = $this->getNode(0);
        $serializer = $firstNode->ownerDocument;

        // Same serializer selection as in html()
        if ($this->html5Parser && '<!DOCTYPE html>' === $serializer->saveXML($serializer->childNodes[0])) {
            $serializer = $this->html5Parser;
        }

        return $serializer->saveHTML($firstNode);
    }

    /**
     * Evaluates an XPath expression.
     *
     * Since an XPath expression might evaluate to either a simple type or a \DOMNodeList,
     * this method will return either an array of simple types or a new Crawler instance.
*/
    public function evaluate(string $xpath): array|self
    {
        if (null === $this->document) {
            throw new \LogicException('Cannot evaluate the expression on an uninitialized crawler.');
        }

        $domxpath = $this->createDOMXPath($this->document, $this->findNamespacePrefixes($xpath));

        // One result per node of the crawler, each node being the evaluation context
        $results = [];
        foreach ($this->nodes as $contextNode) {
            $results[] = $domxpath->evaluate($xpath, $contextNode);
        }

        // The type of the first result tells whether the expression yields node lists
        $yieldsNodeLists = isset($results[0]) && $results[0] instanceof \DOMNodeList;

        return $yieldsNodeLists ? $this->createSubCrawler($results) : $results;
    }

    /**
     * Extracts information from the list of nodes.
     *
     * You can extract attributes or/and the node value (_text) and the node name (_name).
     * When a single attribute is requested, each entry of the result is the bare
     * value instead of an array of values.
     *
     * Example:
     *
     *     $crawler->filter('h1 a')->extract(['_text', 'href']);
     */
    public function extract(array $attributes): array
    {
        $single = 1 === \count($attributes);

        $rows = [];
        foreach ($this->nodes as $node) {
            $values = [];
            foreach ($attributes as $attribute) {
                $values[] = match ($attribute) {
                    '_text' => $node->nodeValue,
                    '_name' => $node->nodeName,
                    default => $node->getAttribute($attribute),
                };
            }

            $rows[] = $single ? $values[0] : $values;
        }

        return $rows;
    }

    /**
     * Filters the list of nodes with an XPath expression.
     *
     * The XPath expression is evaluated in the context of the crawler, which
     * is considered as a fake parent of the elements inside it.
     * This means that a child selector "div" or "./div" will match only
     * the div elements of the current crawler, not their children.
*/
    public function filterXPath(string $xpath): static
    {
        $relativeXPath = $this->relativize($xpath);

        // Nothing is left of the expression once prepared: there cannot be any match
        if ('' === $relativeXPath) {
            return $this->createSubCrawler(null);
        }

        return $this->filterRelativeXPath($relativeXPath);
    }

    /**
     * Filters the list of nodes with a CSS selector.
     *
     * This method only works if you have installed the CssSelector Symfony Component.
     *
     * @throws \LogicException if the CssSelector Component is not available
     */
    public function filter(string $selector): static
    {
        // The CssSelector already prefixes the selector with descendant-or-self::
        $xpath = $this->createCssSelectorConverter()->toXPath($selector);

        return $this->filterRelativeXPath($xpath);
    }

    /**
     * Selects links by name or alt value for clickable images.
     */
    public function selectLink(string $value): static
    {
        // Surrounding spaces make the lookup match the value as a whole word
        $literal = static::xpathLiteral(' '.$value.' ');
        $xpath = \sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %1$s) or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %1$s)]]', $literal);

        return $this->filterRelativeXPath($xpath);
    }

    /**
     * Selects images by alt value.
     */
    public function selectImage(string $value): static
    {
        return $this->filterRelativeXPath(
            \sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', static::xpathLiteral($value))
        );
    }

    /**
     * Selects a button by its text content, id, value, name or alt attribute.
*/
    public function selectButton(string $value): static
    {
        // Lower-cased @type, so that the type checks ignore the case of the attribute
        $lowerCasedType = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';
        // Padded with spaces for whole-word lookups in text, value and alt
        $paddedLiteral = static::xpathLiteral(' '.$value.' ');
        // Bare value for exact comparisons against id and name
        $exactLiteral = static::xpathLiteral($value);

        $xpath = \sprintf('descendant-or-self::input[((contains(%1$s, "submit") or contains(%1$s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %2$s)) or (contains(%1$s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %2$s)) or @id=%3$s or @name=%3$s] | descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %2$s) or contains(concat(\' \', normalize-space(string(@value)), \' \'), %2$s) or @id=%3$s or @name=%3$s]', $lowerCasedType, $paddedLiteral, $exactLiteral);

        return $this->filterRelativeXPath($xpath);
    }

    /**
     * Returns a Link object for the first node in the list.
     *
     * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
     */
    public function link(string $method = 'get'): Link
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $element = $this->getNode(0);

        if ($element instanceof \DOMElement) {
            return new Link($element, $this->baseHref, $method);
        }

        throw new \InvalidArgumentException(\sprintf('The selected node should be instance of DOMElement, got "%s".', get_debug_type($element)));
    }

    /**
     * Returns an array of Link objects for the nodes in the list.
*
     * Every link is created with the "get" method.
     *
     * @return Link[]
     *
     * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
     */
    public function links(): array
    {
        $links = [];
        foreach ($this->nodes as $element) {
            if ($element instanceof \DOMElement) {
                $links[] = new Link($element, $this->baseHref, 'get');

                continue;
            }

            throw new \InvalidArgumentException(\sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_debug_type($element)));
        }

        return $links;
    }

    /**
     * Returns an Image object for the first node in the list.
     *
     * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
     */
    public function image(): Image
    {
        if (!\count($this)) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $element = $this->getNode(0);

        if ($element instanceof \DOMElement) {
            return new Image($element, $this->baseHref);
        }

        throw new \InvalidArgumentException(\sprintf('The selected node should be instance of DOMElement, got "%s".', get_debug_type($element)));
    }

    /**
     * Returns an array of Image objects for the nodes in the list.
     *
     * @return Image[]
     *
     * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
     */
    public function images(): array
    {
        $images = [];
        foreach ($this as $element) {
            if ($element instanceof \DOMElement) {
                $images[] = new Image($element, $this->baseHref);

                continue;
            }

            throw new \InvalidArgumentException(\sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_debug_type($element)));
        }

        return $images;
    }

    /**
     * Returns a Form object for the first node in the list.
*
     * When values are given, they are set on the form before it is returned.
     *
     * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
     */
    public function form(?array $values = null, ?string $method = null): Form
    {
        if (!$this->nodes) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $element = $this->getNode(0);

        if (!$element instanceof \DOMElement) {
            throw new \InvalidArgumentException(\sprintf('The selected node should be instance of DOMElement, got "%s".', get_debug_type($element)));
        }

        $form = new Form($element, $this->uri, $method, $this->baseHref);

        if (null === $values) {
            return $form;
        }

        $form->setValues($values);

        return $form;
    }

    /**
     * Overloads a default namespace prefix to be used with XPath and CSS expressions.
     */
    public function setDefaultNamespacePrefix(string $prefix): void
    {
        $this->defaultNamespacePrefix = $prefix;
    }

    /**
     * Registers a namespace under a prefix; a prefix registered earlier is overwritten.
     */
    public function registerNamespace(string $prefix, string $namespace): void
    {
        $this->namespaces[$prefix] = $namespace;
    }

    /**
     * Converts string for XPath expressions.
     *
     * Escaped characters are: quotes (") and apostrophe (').
*
     * Examples:
     *
     *     echo Crawler::xpathLiteral('foo " bar');
     *     //prints 'foo " bar'
     *
     *     echo Crawler::xpathLiteral("foo ' bar");
     *     //prints "foo ' bar"
     *
     *     echo Crawler::xpathLiteral('a\'b"c');
     *     //prints concat('a', "'", 'b"c')
     */
    public static function xpathLiteral(string $s): string
    {
        // No apostrophe: single quotes are safe delimiters
        if (!str_contains($s, "'")) {
            return "'".$s."'";
        }

        // Apostrophes but no double quote: double quotes are safe delimiters
        if (!str_contains($s, '"')) {
            return '"'.$s.'"';
        }

        // Both kinds of quotes: single-quote each chunk found between apostrophes
        // and glue the chunks with a double-quoted apostrophe
        $parts = [];
        foreach (explode("'", $s) as $index => $chunk) {
            if ($index > 0) {
                $parts[] = "\"'\"";
            }
            $parts[] = "'".$chunk."'";
        }

        return \sprintf('concat(%s)', implode(', ', $parts));
    }

    /**
     * Filters the list of nodes with an XPath expression.
     *
     * The XPath expression should already be processed to apply it in the context of each node.
     */
    private function filterRelativeXPath(string $xpath): static
    {
        $result = $this->createSubCrawler(null);
        if (null === $this->document) {
            return $result;
        }

        $domxpath = $this->createDOMXPath($this->document, $this->findNamespacePrefixes($xpath));

        // Run the query once per node of the crawler, using the node as context
        foreach ($this->nodes as $contextNode) {
            $result->add($domxpath->query($xpath, $contextNode));
        }

        return $result;
    }

    /**
     * Make the XPath relative to the current context.
     *
     * The returned XPath will match elements matching the XPath inside the current crawler
     * when running in the context of a node of the crawler.
*/
    private function relativize(string $xpath): string
    {
        $expressions = [];

        // Stands in for the expressions which cannot match anything in the crawler:
        // it never matches, yet keeps the resulting union valid
        $nonMatchingExpression = 'a[name() = "b"]';

        $whitespace = " \t\n\r\0\x0B";

        $xpathLen = \strlen($xpath);
        $openedBrackets = 0;
        $startPosition = strspn($xpath, $whitespace);

        for ($i = $startPosition; $i <= $xpathLen; ++$i) {
            // Jump to the next quote, bracket or union operator (or to the end of the string)
            $i += strcspn($xpath, '"\'[]|', $i);

            if ($i < $xpathLen) {
                switch ($xpath[$i]) {
                    case '"':
                    case "'":
                        // Skip the whole quoted string
                        if (false === $i = strpos($xpath, $xpath[$i], $i + 1)) {
                            return $xpath; // The XPath expression is invalid
                        }
                        continue 2;
                    case '[':
                        ++$openedBrackets;
                        continue 2;
                    case ']':
                        --$openedBrackets;
                        continue 2;
                }
            }

            // A "|" found inside a predicate does not separate top-level expressions
            if ($openedBrackets) {
                continue;
            }

            $parenthesis = '';
            if ($startPosition < $xpathLen && '(' === $xpath[$startPosition]) {
                // If the union is inside some braces, we need to preserve the opening braces and apply
                // the change only inside it.
                $prefixLength = 1 + strspn($xpath, '('.$whitespace, $startPosition + 1);
                $parenthesis = substr($xpath, $startPosition, $prefixLength);
                $startPosition += $prefixLength;
            }
            $expression = rtrim(substr($xpath, $startPosition, $i - $startPosition));

            if (str_starts_with($expression, 'self::*/')) {
                $expression = './'.substr($expression, 8);
            }

            // add prefix before absolute element selector
            if ('' === $expression) {
                $expression = $nonMatchingExpression;
            } elseif (str_starts_with($expression, '//')) {
                $expression = 'descendant-or-self::'.substr($expression, 2);
            } elseif (str_starts_with($expression, './/')) {
                $expression = 'descendant-or-self::'.substr($expression, 3);
            } elseif (str_starts_with($expression, './')) {
                $expression = 'self::'.substr($expression, 2);
            } elseif (str_starts_with($expression, 'child::')) {
                $expression = 'self::'.substr($expression, 7);
            } elseif ('/' === $expression[0] || '.' === $expression[0] || str_starts_with($expression, 'self::')) {
                $expression = $nonMatchingExpression;
            } elseif (str_starts_with($expression, 'descendant::')) {
                $expression = 'descendant-or-self::'.substr($expression, 12);
            } elseif (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $expression)) {
                // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
                $expression = $nonMatchingExpression;
            } elseif (!str_starts_with($expression, 'descendant-or-self::')) {
                $expression = 'self::'.$expression;
            }
            $expressions[] = $parenthesis.$expression;

            if ($i === $xpathLen) {
                return implode(' | ', $expressions);
            }

            // Move past the union operator and the whitespace that follows it
            $i += strspn($xpath, $whitespace, $i + 1);
            $startPosition = $i + 1;
        }

        return $xpath; // The XPath expression is invalid
    }

    /**
     * Returns the node at the given position, or null when there is none.
     */
    public function getNode(int $position): ?\DOMNode
    {
        return $this->nodes[$position] ?? null;
    }

    /**
     * Returns the number of nodes in the list.
     */
    public function count(): int
    {
        return \count($this->nodes);
    }

    /**
     * @return \ArrayIterator<int, \DOMNode>
     */
    public function getIterator(): \ArrayIterator
    {
        return new \ArrayIterator($this->nodes);
    }

    /**
     * Collects the element nodes found from $node onward, following the given sibling property.
     *
     * $node itself is part of the walk; the first node of the crawler is always left out.
     */
    protected function sibling(\DOMNode $node, string $siblingDir = 'nextSibling'): array
    {
        $elements = [];

        $excluded = $this->getNode(0);
        do {
            if ($node !== $excluded && \XML_ELEMENT_NODE === $node->nodeType) {
                $elements[] = $node;
            }
        } while ($node = $node->$siblingDir);

        return $elements;
    }

    /**
     * Parses the markup with the HTML5 parser.
     *
     * Content in a charset that mbstring cannot handle is first converted to
     * HTML entities and then parsed as UTF-8.
     */
    private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
    {
        if (!$this->supportsEncoding($charset)) {
            $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
            $charset = 'UTF-8';
        }

        return $this->html5Parser->parse($htmlContent, ['encoding' => $charset]);
    }

    /**
     * Tells whether mb_convert_encoding() accepts the given encoding.
     */
    private function supportsEncoding(string $encoding): bool
    {
        try {
            return '' === @mb_convert_encoding('', $encoding, 'UTF-8');
        } catch (\Throwable) {
            return false;
        }
    }

    /**
     * Parses the markup with the libxml HTML parser, libxml errors being collected internally.
     */
    private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
    {
        if ('UTF-8' === $charset && preg_match('//u', $htmlContent)) {
            // Valid UTF-8: declare the encoding up front
            $htmlContent = '<?xml encoding="UTF-8">'.$htmlContent;
        } else {
            $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
        }

        $previousErrorMode = libxml_use_internal_errors(true);

        $htmlDocument = new \DOMDocument('1.0', $charset);
        $htmlDocument->validateOnParse = true;

        if ('' !== trim($htmlContent)) {
            @$htmlDocument->loadHTML($htmlContent);
        }

        libxml_use_internal_errors($previousErrorMode);

        return $htmlDocument;
    }

    /**
     * Converts charset to HTML-entities to ensure valid parsing.
*/
    private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
    {
        // Conversion map: code points 0x80..0x10FFFF, offset 0, mask 0x1FFFFF.
        $entityMap = [0x80, 0x10FFFF, 0, 0x1FFFFF];

        // While this handler is installed, PHP errors surface as exceptions.
        set_error_handler(static function () {
            throw new \Exception();
        });

        try {
            return mb_encode_numericentity($htmlContent, $entityMap, $charset);
        } catch (\Exception|\ValueError) {
            // Direct encoding failed: best effort through an intermediate UTF-8 conversion.
            try {
                $htmlContent = iconv($charset, 'UTF-8', $htmlContent);
                $htmlContent = mb_encode_numericentity($htmlContent, $entityMap, 'UTF-8');
            } catch (\Exception|\ValueError) {
                // Deliberately ignored: whatever $htmlContent holds at this point is returned.
            }

            return $htmlContent;
        } finally {
            restore_error_handler();
        }
    }

    /**
     * Builds a DOMXPath for the document and registers each prefix whose namespace can be resolved.
     *
     * @throws \InvalidArgumentException
     */
    private function createDOMXPath(\DOMDocument $document, array $prefixes = []): \DOMXPath
    {
        $xpathEngine = new \DOMXPath($document);

        foreach ($prefixes as $prefix) {
            $resolved = $this->discoverNamespace($xpathEngine, $prefix);
            if (null === $resolved) {
                continue;
            }

            $xpathEngine->registerNamespace($prefix, $resolved);
        }

        return $xpathEngine;
    }

    /**
     * Resolves the namespace bound to a prefix.
     *
     * Manually registered namespaces are checked first, then the cache, and
     * finally the document itself; the document lookup result (null included)
     * is stored in the cache.
     *
     * @throws \InvalidArgumentException
     */
    private function discoverNamespace(\DOMXPath $domxpath, string $prefix): ?string
    {
        if (\array_key_exists($prefix, $this->namespaces)) {
            return $this->namespaces[$prefix];
        }

        if ($this->cachedNamespaces->offsetExists($prefix)) {
            return $this->cachedNamespaces[$prefix];
        }

        // the default prefix is looked up under the empty namespace name
        $lookupName = $this->defaultNamespacePrefix === $prefix ? '' : $prefix;

        // ask for one namespace, otherwise we'd get a collection with an item for each node
        $found = $domxpath->query(\sprintf('(//namespace::*[name()="%s"])[last()]', $lookupName));

        $first = $found->item(0);
        $resolved = $first ? $first->nodeValue : null;
        $this->cachedNamespaces[$prefix] = $resolved;

        return $resolved;
    }

    /**
     * Extracts the distinct namespace prefixes used in an XPath expression.
     */
    private function findNamespacePrefixes(string $xpath): array
    {
        if (!preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $matches)) {
            return [];
        }

        return array_unique($matches['prefix']);
    }

    /**
     * Creates a crawler for some subnodes.
     *
     * The new crawler gets this crawler's document, HTML/XML mode, registered
     * namespaces, namespace cache object and HTML5 parser.
     *
     * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $nodes
     */
    private function createSubCrawler(\DOMNodeList|\DOMNode|array|string|null $nodes): static
    {
        $child = new static($nodes, $this->uri, $this->baseHref);

        $child->html5Parser = $this->html5Parser;
        $child->cachedNamespaces = $this->cachedNamespaces;
        $child->namespaces = $this->namespaces;
        $child->document = $this->document;
        $child->isHtml = $this->isHtml;

        return $child;
    }

    /**
     * @throws \LogicException If the CssSelector Component is not available
     */
    private function createCssSelectorConverter(): CssSelectorConverter
    {
        if (class_exists(CssSelectorConverter::class)) {
            return new CssSelectorConverter($this->isHtml);
        }

        throw new \LogicException('To filter with a CSS selector, install the CssSelector component ("composer require symfony/css-selector"). Or use filterXpath instead.');
    }

    /**
     * Parse string into DOMDocument object using HTML5 parser if the content is HTML5 and the library is available.
     * Use libxml parser otherwise.
*/
    private function parseHtmlString(string $content, string $charset): \DOMDocument
    {
        return $this->canParseHtml5String($content)
            ? $this->parseHtml5($content, $charset)
            : $this->parseXhtml($content, $charset);
    }

    /**
     * Tells whether the content should go through the HTML5 parser.
     *
     * That requires a configured HTML5 parser and a "<!doctype html>" declaration
     * (matched case-insensitively) preceded by nothing or by a valid heading.
     */
    private function canParseHtml5String(string $content): bool
    {
        if (null === $this->html5Parser) {
            return false;
        }

        $doctypeOffset = stripos($content, '<!doctype html>');
        if (false === $doctypeOffset) {
            return false;
        }

        // a doctype at offset 0 means there is no heading to validate
        if (0 === $doctypeOffset) {
            return true;
        }

        return $this->isValidHtml5Heading(substr($content, 0, $doctypeOffset));
    }

    /**
     * Checks that the text before the doctype is only an optional BOM (U+FEFF),
     * whitespace and HTML comments.
     */
    private function isValidHtml5Heading(string $heading): bool
    {
        $matchCount = preg_match('/^\x{FEFF}?\s*(<!--[^>]*?-->\s*)*$/u', $heading);

        return 1 === $matchCount;
    }

    /**
     * Replaces each whitespace run (or lone newline, carriage return, tab or
     * form feed) with one space, then trims those characters from both ends.
     */
    private function normalizeWhitespace(string $string): string
    {
        $collapsed = preg_replace("/(?:[ \n\r\t\x0C]{2,}+|[\n\r\t\x0C])/", ' ', $string);

        return trim($collapsed, " \n\r\t\x0C");
    }
}