friendship ended with social-app. php is my new best friend
1<?php
2
3/*
4 * This file is part of the Symfony package.
5 *
6 * (c) Fabien Potencier <fabien@symfony.com>
7 *
8 * For the full copyright and license information, please view the LICENSE
9 * file that was distributed with this source code.
10 */
11
12namespace Symfony\Component\DomCrawler;
13
14use Masterminds\HTML5;
15use Symfony\Component\CssSelector\CssSelectorConverter;
16
17/**
18 * Crawler eases navigation of a list of \DOMNode objects.
19 *
20 * @author Fabien Potencier <fabien@symfony.com>
21 *
22 * @implements \IteratorAggregate<int, \DOMNode>
23 */
24class Crawler implements \Countable, \IteratorAggregate
25{
/**
 * The default namespace prefix to be used with XPath and CSS expressions.
 */
private string $defaultNamespacePrefix = 'default';

/**
 * A map of manually registered namespaces, keyed by prefix (see registerNamespace()).
 *
 * @var array<string, string>
 */
private array $namespaces = [];

/**
 * A map of cached namespaces, keyed by prefix; replaced by a fresh instance in clear().
 */
private \ArrayObject $cachedNamespaces;

/**
 * Base href handed to Link, Image and Form objects; set in the constructor
 * (falling back to the URI) and possibly updated by addHtmlContent().
 */
private ?string $baseHref;

/**
 * The owner document of the first node added; null until a node is added or after clear().
 */
private ?\DOMDocument $document = null;

/**
 * @var list<\DOMNode>
 */
private array $nodes = [];

/**
 * Whether the Crawler contains HTML or XML content (used when converting CSS to XPath).
 */
private bool $isHtml = true;

/**
 * The HTML5 parser, or null when the constructor was called with $useHtml5Parser = false.
 */
private ?HTML5 $html5Parser = null;
57
/**
 * Builds a crawler and seeds it with the given content, if any.
 *
 * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node           A Node to use as the base for the crawling
 * @param string|null                                  $uri            The current URI
 * @param string|null                                  $baseHref       The base href; the URI is used when this is empty
 * @param bool                                         $useHtml5Parser Whether to instantiate the HTML5 parser
 */
public function __construct(
    \DOMNodeList|\DOMNode|array|string|null $node = null,
    protected ?string $uri = null,
    ?string $baseHref = null,
    bool $useHtml5Parser = true,
) {
    $this->cachedNamespaces = new \ArrayObject();

    if ($useHtml5Parser) {
        $this->html5Parser = new HTML5(['disable_html_ns' => true]);
    } else {
        $this->html5Parser = null;
    }

    // A missing or empty base href falls back to the URI
    $this->baseHref = $baseHref ? $baseHref : $uri;

    // Must come last: adding content relies on the state initialised above
    $this->add($node);
}
73
/**
 * Gives back the URI this crawler was constructed with.
 *
 * @return string|null The URI, or null when none was provided
 */
public function getUri(): ?string
{
    return $this->uri;
}
81
/**
 * Gives back the base href currently used by this crawler.
 *
 * @return string|null The base href, or null when none is known
 */
public function getBaseHref(): ?string
{
    return $this->baseHref;
}
89
/**
 * Empties the crawler: drops every node, forgets the owner document
 * and starts over with a fresh namespace cache.
 */
public function clear(): void
{
    $this->cachedNamespaces = new \ArrayObject();
    $this->document = null;
    $this->nodes = [];
}
99
/**
 * Adds a node (or several) to the current list of nodes.
 *
 * Dispatches to the specialized add*() method matching the argument type;
 * null is accepted and ignored.
 *
 * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node A node
 *
 * @throws \InvalidArgumentException when node is not the expected type
 */
public function add(\DOMNodeList|\DOMNode|array|string|null $node): void
{
    if (null === $node) {
        return;
    }

    if ($node instanceof \DOMNodeList) {
        $this->addNodeList($node);

        return;
    }

    if ($node instanceof \DOMNode) {
        $this->addNode($node);

        return;
    }

    if (\is_array($node)) {
        $this->addNodes($node);

        return;
    }

    if (\is_string($node)) {
        $this->addContent($node);

        return;
    }

    throw new \InvalidArgumentException(\sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', get_debug_type($node)));
}
124
/**
 * Adds HTML/XML content.
 *
 * If the charset is not set via the content type, it is assumed to be UTF-8,
 * or ISO-8859-1 as a fallback, which is the default charset defined by the
 * HTTP 1.1 specification.
 *
 * When no type is given, content starting with "<?xml" is treated as
 * "application/xml" and anything else as "text/html". Content whose type
 * mentions neither "xml" nor "html" (case-insensitively) is ignored.
 *
 * @param string      $content The markup to parse
 * @param string|null $type    The content type; guessed from the content when empty
 */
public function addContent(string $content, ?string $type = null): void
{
    if (!$type) {
        $type = str_starts_with($content, '<?xml') ? 'application/xml' : 'text/html';
    }

    // DOM only for HTML/XML content
    if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
        return;
    }

    $charset = preg_match('//u', $content) ? 'UTF-8' : 'ISO-8859-1';

    // http://www.w3.org/TR/encoding/#encodings
    // http://www.w3.org/TR/REC-xml/#NT-EncName
    $content = preg_replace_callback('/(charset *= *["\']?)([a-zA-Z\-0-9_:.]+)/i', function ($m) use (&$charset) {
        // Adopt the declared charset only when "charset=" comes back unchanged from the entity conversion using it
        if ('charset=' === $this->convertToHtmlEntities('charset=', $m[2])) {
            $charset = $m[2];
        }

        return $m[1].$charset;
    }, $content, 1);

    // The type was matched case-insensitively, so the captured marker must be
    // lower-cased before comparing; otherwise "application/XML" is parsed as HTML
    if ('x' === strtolower($xmlMatches[1])) {
        $this->addXmlContent($content, $charset);
    } else {
        $this->addHtmlContent($content, $charset);
    }
}
161
/**
 * Parses an HTML string and adds the resulting document to the list of nodes.
 *
 * The libxml errors are disabled when the content is parsed.
 *
 * If you want to get parsing errors, be sure to enable
 * internal errors via libxml_use_internal_errors(true)
 * and then, get the errors via libxml_get_errors(). Be
 * sure to clear errors with libxml_clear_errors() afterward.
 *
 * When a <base> element with a non-empty href is found, the base href of the
 * crawler is updated: resolved through a Link against the current base href
 * if there is one, taken as is otherwise.
 */
public function addHtmlContent(string $content, string $charset = 'UTF-8'): void
{
    $dom = $this->parseHtmlString($content, $charset);
    $this->addDocument($dom);

    $hrefs = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
    $declaredHref = current($hrefs);

    if (!\count($hrefs) || !$declaredHref) {
        return;
    }

    if (!$this->baseHref) {
        $this->baseHref = $declaredHref;

        return;
    }

    // Resolve the declared href against the base href already known
    $anchor = $dom->createElement('a');
    $anchor->setAttribute('href', $declaredHref);
    $resolved = new Link($anchor, $this->baseHref);
    $this->baseHref = $resolved->getUri();
}
191
/**
 * Adds an XML content to the list of nodes.
 *
 * The libxml errors are disabled when the content is parsed; the previous
 * libxml error mode is restored afterwards, even if parsing throws.
 *
 * If you want to get parsing errors, be sure to enable
 * internal errors via libxml_use_internal_errors(true)
 * and then, get the errors via libxml_get_errors(). Be
 * sure to clear errors with libxml_clear_errors() afterward.
 *
 * Content that is empty once trimmed is not parsed. After this call the
 * crawler is flagged as holding XML rather than HTML.
 *
 * @param string $content The XML to parse
 * @param string $charset The encoding given to the \DOMDocument
 * @param int    $options Bitwise OR of the libxml option constants
 *                        LIBXML_PARSEHUGE is dangerous, see
 *                        http://symfony.com/blog/security-release-symfony-2-0-17-released
 */
public function addXmlContent(string $content, string $charset = 'UTF-8', int $options = \LIBXML_NONET): void
{
    // remove the default namespace if it's the only namespace to make XPath expressions simpler
    if (!str_contains($content, 'xmlns:')) {
        $content = str_replace('xmlns', 'ns', $content);
    }

    $internalErrors = libxml_use_internal_errors(true);

    try {
        $dom = new \DOMDocument('1.0', $charset);
        $dom->validateOnParse = true;

        if ('' !== trim($content)) {
            @$dom->loadXML($content, $options);
        }
    } finally {
        // Never leave the process-wide libxml error mode altered, whatever happens above
        libxml_use_internal_errors($internalErrors);
    }

    $this->addDocument($dom);

    $this->isHtml = false;
}
228
/**
 * Adds the root element of a \DOMDocument to the list of nodes.
 *
 * A document without a root element is ignored.
 *
 * @param \DOMDocument $dom A \DOMDocument instance
 */
public function addDocument(\DOMDocument $dom): void
{
    if (!$dom->documentElement) {
        return;
    }

    $this->addNode($dom->documentElement);
}
240
/**
 * Adds every \DOMNode of a \DOMNodeList to the list of nodes.
 *
 * Items that are not \DOMNode instances are skipped.
 *
 * @param \DOMNodeList $nodes A \DOMNodeList instance
 */
public function addNodeList(\DOMNodeList $nodes): void
{
    foreach ($nodes as $item) {
        if (!$item instanceof \DOMNode) {
            continue;
        }

        $this->addNode($item);
    }
}
254
/**
 * Adds each entry of an array to the list of nodes.
 *
 * Every entry goes through add(), so it is dispatched on its own type.
 *
 * @param \DOMNode[] $nodes An array of \DOMNode instances
 */
public function addNodes(array $nodes): void
{
    foreach ($nodes as $entry) {
        $this->add($entry);
    }
}
266
/**
 * Adds a \DOMNode instance to the list of nodes.
 *
 * A \DOMDocument is replaced by its root element; a document without a root
 * element is ignored. Nodes already present in the crawler are not added twice.
 *
 * @param \DOMNode $node A \DOMNode instance
 *
 * @throws \InvalidArgumentException When the node belongs to a different document than the nodes already added
 */
public function addNode(\DOMNode $node): void
{
    if ($node instanceof \DOMDocument) {
        $node = $node->documentElement;

        // An empty document has no root element: there is nothing to add, and
        // going on would dereference null and store it in the node list
        if (null === $node) {
            return;
        }
    }

    if (null !== $this->document && $this->document !== $node->ownerDocument) {
        throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.');
    }

    // The first node added decides which document the crawler is bound to
    $this->document ??= $node->ownerDocument;

    // Don't add duplicate nodes in the Crawler
    if (\in_array($node, $this->nodes, true)) {
        return;
    }

    $this->nodes[] = $node;
}
291
/**
 * Returns a crawler wrapping the node found at the given position.
 *
 * An empty crawler is returned when no node exists at that position.
 */
public function eq(int $position): static
{
    return $this->createSubCrawler($this->nodes[$position] ?? null);
}
303
/**
 * Calls an anonymous function on each node of the list.
 *
 * The anonymous function receives the node wrapped in a Crawler instance
 * as first argument and its position as second argument.
 *
 * Example:
 *
 *     $crawler->filter('h1')->each(function ($node, $i) {
 *         return $node->text();
 *     });
 *
 * @param \Closure $closure An anonymous function
 *
 * @return array An array of values returned by the anonymous function
 */
public function each(\Closure $closure): array
{
    return array_map(
        fn ($node, $position) => $closure($this->createSubCrawler($node), $position),
        $this->nodes,
        array_keys($this->nodes),
    );
}
329
/**
 * Returns a crawler holding the portion of the node list selected by
 * $offset and $length (same semantics as array_slice()).
 */
public function slice(int $offset = 0, ?int $length = null): static
{
    $portion = \array_slice($this->nodes, $offset, $length);

    return $this->createSubCrawler($portion);
}
337
/**
 * Reduces the list of nodes by calling an anonymous function.
 *
 * The anonymous function receives the node wrapped in a Crawler instance and
 * its position. A node is removed only when the function returns exactly false.
 *
 * @param \Closure $closure An anonymous function
 */
public function reduce(\Closure $closure): static
{
    $kept = array_filter(
        $this->nodes,
        fn ($node, $position) => false !== $closure($this->createSubCrawler($node), $position),
        \ARRAY_FILTER_USE_BOTH,
    );

    // array_filter() preserves keys; the sub-crawler is given a plain list
    return $this->createSubCrawler(array_values($kept));
}
356
/**
 * Returns a crawler for the first node of the current selection
 * (empty when the selection is empty).
 */
public function first(): static
{
    return $this->eq(0);
}
364
/**
 * Returns a crawler for the last node of the current selection
 * (empty when the selection is empty).
 */
public function last(): static
{
    $lastPosition = \count($this->nodes) - 1;

    return $this->eq($lastPosition);
}
372
/**
 * Returns the sibling elements of the first node of the current selection.
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function siblings(): static
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    // Walk all children of the parent, starting from the very first one
    $firstOfParent = $this->getNode(0)->parentNode->firstChild;

    return $this->createSubCrawler($this->sibling($firstOfParent));
}
386
/**
 * Tells whether the CSS selector matches at least one node of the current selection.
 *
 * An empty selection never matches.
 */
public function matches(string $selector): bool
{
    if ([] === $this->nodes) {
        return false;
    }

    $xpath = $this->createCssSelectorConverter()->toXPath($selector, 'self::');
    $matched = $this->filterRelativeXPath($xpath);

    return 0 !== $matched->count();
}
398
/**
 * Returns the nearest element matching the selector, starting with the first
 * node of the selection itself and then heading toward the document root.
 *
 * @see https://developer.mozilla.org/en-US/docs/Web/API/Element/closest#Polyfill
 *
 * @return self|null The matching element wrapped in a crawler, or null when none matches
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function closest(string $selector): ?self
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $current = $this->getNode(0);

    // Stop as soon as the walk leaves element nodes
    while (null !== $current && \XML_ELEMENT_NODE === $current->nodeType) {
        $candidate = $this->createSubCrawler($current);

        if ($candidate->matches($selector)) {
            return $candidate;
        }

        $current = $candidate->getNode(0)->parentNode;
    }

    return null;
}
425
/**
 * Returns the elements following the first node of the current selection
 * among its siblings.
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function nextAll(): static
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $following = $this->sibling($this->getNode(0));

    return $this->createSubCrawler($following);
}
439
/**
 * Returns the elements preceding the first node of the current selection
 * among its siblings, nearest first.
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function previousAll(): static
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $preceding = $this->sibling($this->getNode(0), 'previousSibling');

    return $this->createSubCrawler($preceding);
}
453
/**
 * Returns the ancestor elements of the first node of the current selection,
 * from the closest one up to the root.
 *
 * @throws \InvalidArgumentException When the current node is empty
 */
public function ancestors(): static
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $ancestors = [];

    for ($parent = $this->getNode(0)->parentNode; $parent; $parent = $parent->parentNode) {
        // Only element nodes are collected
        if (\XML_ELEMENT_NODE !== $parent->nodeType) {
            continue;
        }

        $ancestors[] = $parent;
    }

    return $this->createSubCrawler($ancestors);
}
476
/**
 * Returns the children of the current selection.
 *
 * With a selector, the selector is converted to a "child::" XPath and applied
 * to the selection; without one, the child elements of the first node are returned.
 *
 * @throws \InvalidArgumentException When current node is empty
 * @throws \RuntimeException         If the CssSelector Component is not available and $selector is provided
 */
public function children(?string $selector = null): static
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    if (null === $selector) {
        $firstChild = $this->getNode(0)->firstChild;

        if (!$firstChild) {
            return $this->createSubCrawler([]);
        }

        return $this->createSubCrawler($this->sibling($firstChild));
    }

    $xpath = $this->createCssSelectorConverter()->toXPath($selector, 'child::');

    return $this->filterRelativeXPath($xpath);
}
500
/**
 * Returns the value of an attribute of the first node of the list.
 *
 * @param string|null $default When not null: the value to return when the node or attribute is empty
 *
 * @return string|null The attribute value, or $default when the attribute is absent
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function attr(string $attribute, ?string $default = null): ?string
{
    if ([] === $this->nodes) {
        return $default ?? throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    if (!$element->hasAttribute($attribute)) {
        return $default;
    }

    return $element->getAttribute($attribute);
}
522
/**
 * Returns the name of the first node of the list.
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function nodeName(): string
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $first = $this->getNode(0);

    return $first->nodeName;
}
536
/**
 * Returns the text (node value) of the first node of the list.
 *
 * Whitespace is normalized unless false is passed as the second argument.
 *
 * @param string|null $default             When not null: the value to return when the current node is empty
 * @param bool        $normalizeWhitespace Whether whitespaces should be trimmed and normalized to single spaces
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function text(?string $default = null, bool $normalizeWhitespace = true): string
{
    if ([] === $this->nodes) {
        return $default ?? throw new \InvalidArgumentException('The current node list is empty.');
    }

    $value = $this->getNode(0)->nodeValue;

    return $normalizeWhitespace ? $this->normalizeWhitespace($value) : $value;
}
565
/**
 * Returns the text held directly by the first node of the list, ignoring the
 * content of its child elements.
 *
 * Only text and CDATA children are considered. Without normalization the value
 * of the first such child is returned as is; with normalization the first such
 * child that is not blank is returned, normalized. An empty string is returned
 * when there is no suitable child or when the node list is empty.
 *
 * @param bool $normalizeWhitespace Whether whitespaces should be trimmed and normalized to single spaces
 */
public function innerText(bool $normalizeWhitespace = true): string
{
    // getNode() yields null on an empty crawler; the nullsafe access keeps the
    // historical '' result without dereferencing null
    foreach ($this->getNode(0)?->childNodes ?? [] as $childNode) {
        if (\XML_TEXT_NODE !== $childNode->nodeType && \XML_CDATA_SECTION_NODE !== $childNode->nodeType) {
            continue;
        }
        if (!$normalizeWhitespace) {
            return $childNode->nodeValue;
        }
        if ('' !== trim($childNode->nodeValue)) {
            return $this->normalizeWhitespace($childNode->nodeValue);
        }
    }

    return '';
}
587
/**
 * Returns the inner HTML of the first node of the list, i.e. the
 * concatenated serialization of its children.
 *
 * The HTML5 parser performs the serialization when it is available and the
 * owner document starts with "<!DOCTYPE html>"; the owner document does otherwise.
 *
 * @param string|null $default When not null: the value to return when the current node is empty
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function html(?string $default = null): string
{
    if ([] === $this->nodes) {
        return $default ?? throw new \InvalidArgumentException('The current node list is empty.');
    }

    $first = $this->getNode(0);
    $serializer = $first->ownerDocument;

    if ($this->html5Parser && '<!DOCTYPE html>' === $serializer->saveXML($serializer->childNodes[0])) {
        $serializer = $this->html5Parser;
    }

    $markup = '';
    foreach ($first->childNodes as $childNode) {
        $markup .= $serializer->saveHTML($childNode);
    }

    return $markup;
}
619
/**
 * Returns the first node of the list serialized as HTML, the node itself included.
 *
 * The HTML5 parser performs the serialization when it is available and the
 * owner document starts with "<!DOCTYPE html>"; the owner document does otherwise.
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function outerHtml(): string
{
    if (0 === \count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $first = $this->getNode(0);
    $serializer = $first->ownerDocument;

    if ($this->html5Parser && '<!DOCTYPE html>' === $serializer->saveXML($serializer->childNodes[0])) {
        $serializer = $this->html5Parser;
    }

    return $serializer->saveHTML($first);
}
635
/**
 * Evaluates an XPath expression against every node of the list.
 *
 * Since an XPath expression might evaluate to either a simple type or a \DOMNodeList,
 * this method will return either an array of simple types or a new Crawler instance.
 * The type of the first result decides which of the two is returned.
 *
 * @throws \LogicException When the crawler holds no document yet
 */
public function evaluate(string $xpath): array|self
{
    if (null === $this->document) {
        throw new \LogicException('Cannot evaluate the expression on an uninitialized crawler.');
    }

    $evaluator = $this->createDOMXPath($this->document, $this->findNamespacePrefixes($xpath));

    $results = [];
    foreach ($this->nodes as $contextNode) {
        $results[] = $evaluator->evaluate($xpath, $contextNode);
    }

    if (($results[0] ?? null) instanceof \DOMNodeList) {
        return $this->createSubCrawler($results);
    }

    return $results;
}
661
/**
 * Extracts information from the list of nodes.
 *
 * You can extract attributes or/and the node value (_text) or the node name (_name).
 * When exactly one attribute is requested, each entry of the result is the
 * value itself; otherwise each entry is an array of values.
 *
 * Example:
 *
 *     $crawler->filter('h1 a')->extract(['_text', 'href']);
 */
public function extract(array $attributes): array
{
    $single = 1 === \count($attributes);

    $rows = [];
    foreach ($this->nodes as $node) {
        $values = [];
        foreach ($attributes as $attribute) {
            $values[] = match ($attribute) {
                '_text' => $node->nodeValue,
                '_name' => $node->nodeName,
                default => $node->getAttribute($attribute),
            };
        }

        $rows[] = $single ? $values[0] : $values;
    }

    return $rows;
}
693
/**
 * Filters the list of nodes with an XPath expression.
 *
 * The XPath expression is evaluated in the context of the crawler, which
 * is considered as a fake parent of the elements inside it.
 * This means that a child selector "div" or "./div" will match only
 * the div elements of the current crawler, not their children.
 */
public function filterXPath(string $xpath): static
{
    $relative = $this->relativize($xpath);

    // If we dropped all expressions in the XPath while preparing it, there would be no match
    if ('' !== $relative) {
        return $this->filterRelativeXPath($relative);
    }

    return $this->createSubCrawler(null);
}
713
/**
 * Filters the list of nodes with a CSS selector.
 *
 * This method only works if you have installed the CssSelector Symfony Component.
 *
 * @throws \LogicException if the CssSelector Component is not available
 */
public function filter(string $selector): static
{
    // The CssSelector already prefixes the selector with descendant-or-self::
    $xpath = $this->createCssSelectorConverter()->toXPath($selector);

    return $this->filterRelativeXPath($xpath);
}
728
/**
 * Selects links by name or alt value for clickable images.
 *
 * An <a> element is selected when the value appears as a whole
 * space-delimited run in its normalized text, or in the normalized alt
 * attribute of an <img> directly inside it.
 */
public function selectLink(string $value): static
{
    $needle = static::xpathLiteral(' '.$value.' ');
    $xpath = \sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %1$s) or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %1$s)]]', $needle);

    return $this->filterRelativeXPath($xpath);
}
738
/**
 * Selects images whose normalized alt attribute contains the given value.
 */
public function selectImage(string $value): static
{
    $needle = static::xpathLiteral($value);

    return $this->filterRelativeXPath(
        \sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', $needle)
    );
}
748
/**
 * Selects a button by its text content, id, value, name or alt attribute.
 *
 * Both <input> elements (submit/button matched on value, image matched on alt)
 * and <button> elements are considered; the type attribute is compared
 * case-insensitively.
 */
public function selectButton(string $value): static
{
    // Lower-cases @type inside the XPath expression itself
    $lowercasedType = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';
    $paddedValue = static::xpathLiteral(' '.$value.' ');
    $exactValue = static::xpathLiteral($value);

    $xpath = \sprintf('descendant-or-self::input[((contains(%1$s, "submit") or contains(%1$s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %2$s)) or (contains(%1$s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %2$s)) or @id=%3$s or @name=%3$s] | descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %2$s) or contains(concat(\' \', normalize-space(string(@value)), \' \'), %2$s) or @id=%3$s or @name=%3$s]', $lowercasedType, $paddedValue, $exactValue);

    return $this->filterRelativeXPath($xpath);
}
758
/**
 * Builds a Link object from the first node in the list.
 *
 * @param string $method The method handed over to the Link
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function link(string $method = 'get'): Link
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    if ($element instanceof \DOMElement) {
        return new Link($element, $this->baseHref, $method);
    }

    throw new \InvalidArgumentException(\sprintf('The selected node should be instance of DOMElement, got "%s".', get_debug_type($element)));
}
778
/**
 * Builds one Link object (method "get") per node in the list.
 *
 * @return Link[]
 *
 * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
 */
public function links(): array
{
    $result = [];

    foreach ($this->nodes as $element) {
        if ($element instanceof \DOMElement) {
            $result[] = new Link($element, $this->baseHref, 'get');

            continue;
        }

        throw new \InvalidArgumentException(\sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_debug_type($element)));
    }

    return $result;
}
799
/**
 * Builds an Image object from the first node in the list.
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function image(): Image
{
    if (0 === \count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    if ($element instanceof \DOMElement) {
        return new Image($element, $this->baseHref);
    }

    throw new \InvalidArgumentException(\sprintf('The selected node should be instance of DOMElement, got "%s".', get_debug_type($element)));
}
819
/**
 * Builds one Image object per node in the list.
 *
 * @return Image[]
 *
 * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
 */
public function images(): array
{
    $result = [];

    foreach ($this as $element) {
        if ($element instanceof \DOMElement) {
            $result[] = new Image($element, $this->baseHref);

            continue;
        }

        throw new \InvalidArgumentException(\sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_debug_type($element)));
    }

    return $result;
}
838
/**
 * Builds a Form object from the first node in the list.
 *
 * @param array|null  $values Values to set on the form right away, if any
 * @param string|null $method The method handed over to the Form
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function form(?array $values = null, ?string $method = null): Form
{
    if ([] === $this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    if (!$element instanceof \DOMElement) {
        throw new \InvalidArgumentException(\sprintf('The selected node should be instance of DOMElement, got "%s".', get_debug_type($element)));
    }

    $form = new Form($element, $this->uri, $method, $this->baseHref);

    if (null === $values) {
        return $form;
    }

    $form->setValues($values);

    return $form;
}
864
/**
 * Replaces the default namespace prefix used with XPath and CSS expressions.
 *
 * @param string $prefix The new default prefix
 */
public function setDefaultNamespacePrefix(string $prefix): void
{
    $this->defaultNamespacePrefix = $prefix;
}
872
/**
 * Registers a namespace manually under the given prefix, replacing any
 * namespace previously registered for that prefix.
 */
public function registerNamespace(string $prefix, string $namespace): void
{
    $this->namespaces[$prefix] = $namespace;
}
877
/**
 * Turns a string into an XPath string literal.
 *
 * The string is wrapped in apostrophes when it contains none, in double
 * quotes when it contains apostrophes only, and split into a concat() call
 * when it contains both.
 *
 * Examples:
 *
 *     echo Crawler::xpathLiteral('foo " bar');
 *     //prints 'foo " bar'
 *
 *     echo Crawler::xpathLiteral("foo ' bar");
 *     //prints "foo ' bar"
 *
 *     echo Crawler::xpathLiteral('a\'b"c');
 *     //prints concat('a', "'", 'b"c')
 */
public static function xpathLiteral(string $s): string
{
    if (!str_contains($s, "'")) {
        return "'".$s."'";
    }

    if (!str_contains($s, '"')) {
        return '"'.$s.'"';
    }

    // Both kinds of quotes: apostrophe-free chunks are quoted with apostrophes,
    // and a double-quoted apostrophe is put between two consecutive chunks
    $parts = [];
    foreach (explode("'", $s) as $index => $chunk) {
        if (0 < $index) {
            $parts[] = '"\'"';
        }

        $parts[] = "'".$chunk."'";
    }

    return 'concat('.implode(', ', $parts).')';
}
919
/**
 * Filters the list of nodes with an XPath expression.
 *
 * The XPath expression should already be processed to apply it in the context of each node.
 * The result is empty when the crawler holds no document yet.
 */
private function filterRelativeXPath(string $xpath): static
{
    $result = $this->createSubCrawler(null);

    if (null !== $this->document) {
        $query = $this->createDOMXPath($this->document, $this->findNamespacePrefixes($xpath));

        foreach ($this->nodes as $contextNode) {
            $result->add($query->query($xpath, $contextNode));
        }
    }

    return $result;
}
940
/**
 * Make the XPath relative to the current context.
 *
 * The returned XPath will match elements matching the XPath inside the current crawler
 * when running in the context of a node of the crawler.
 *
 * The expression is split on top-level "|" (outside quotes and square brackets), each
 * member is rewritten on its own, and the members are joined back with " | ". When a
 * quoted string is not terminated, the input is returned unchanged.
 *
 * @param string $xpath The XPath expression as given by the caller
 *
 * @return string The rewritten expression, or the original one when it could not be scanned
 */
private function relativize(string $xpath): string
{
    $expressions = [];

    // An expression which will never match, used to replace expressions which cannot match in the crawler.
    // NOTE(review): the original comment ends with an unfinished "We cannot drop"; presumably such
    // members are replaced rather than dropped to keep the union well-formed - confirm.
    $nonMatchingExpression = 'a[name() = "b"]';

    $xpathLen = \strlen($xpath);
    $openedBrackets = 0;
    // Skip leading whitespace
    $startPosition = strspn($xpath, " \t\n\r\0\x0B");

    for ($i = $startPosition; $i <= $xpathLen; ++$i) {
        // Jump to the next quote, square bracket or pipe (or to the end of the string)
        $i += strcspn($xpath, '"\'[]|', $i);

        if ($i < $xpathLen) {
            switch ($xpath[$i]) {
                case '"':
                case "'":
                    // Move to the matching closing quote; the loop increment then steps past it
                    if (false === $i = strpos($xpath, $xpath[$i], $i + 1)) {
                        return $xpath; // The XPath expression is invalid
                    }
                    continue 2;
                case '[':
                    ++$openedBrackets;
                    continue 2;
                case ']':
                    --$openedBrackets;
                    continue 2;
            }
        }
        // From here on, $i points at a "|" or at the end of the string
        if ($openedBrackets) {
            // Inside a predicate: this "|" does not separate members of the outer union
            continue;
        }

        if ($startPosition < $xpathLen && '(' === $xpath[$startPosition]) {
            // If the union is inside some braces, we need to preserve the opening braces and apply
            // the change only inside it.
            $j = 1 + strspn($xpath, "( \t\n\r\0\x0B", $startPosition + 1);
            $parenthesis = substr($xpath, $startPosition, $j);
            $startPosition += $j;
        } else {
            $parenthesis = '';
        }
        // The current member of the union, without trailing whitespace
        $expression = rtrim(substr($xpath, $startPosition, $i - $startPosition));

        if (str_starts_with($expression, 'self::*/')) {
            $expression = './'.substr($expression, 8);
        }

        // add prefix before absolute element selector
        if ('' === $expression) {
            $expression = $nonMatchingExpression;
        } elseif (str_starts_with($expression, '//')) {
            $expression = 'descendant-or-self::'.substr($expression, 2);
        } elseif (str_starts_with($expression, './/')) {
            $expression = 'descendant-or-self::'.substr($expression, 3);
        } elseif (str_starts_with($expression, './')) {
            $expression = 'self::'.substr($expression, 2);
        } elseif (str_starts_with($expression, 'child::')) {
            $expression = 'self::'.substr($expression, 7);
        } elseif ('/' === $expression[0] || '.' === $expression[0] || str_starts_with($expression, 'self::')) {
            $expression = $nonMatchingExpression;
        } elseif (str_starts_with($expression, 'descendant::')) {
            $expression = 'descendant-or-self::'.substr($expression, 12);
        } elseif (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $expression)) {
            // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
            $expression = $nonMatchingExpression;
        } elseif (!str_starts_with($expression, 'descendant-or-self::')) {
            $expression = 'self::'.$expression;
        }
        $expressions[] = $parenthesis.$expression;

        if ($i === $xpathLen) {
            // Last member processed
            return implode(' | ', $expressions);
        }

        // Skip the whitespace following the "|"; the next member starts right after it
        $i += strspn($xpath, " \t\n\r\0\x0B", $i + 1);
        $startPosition = $i + 1;
    }

    return $xpath; // The XPath expression is invalid
}
1030
/**
 * Returns the raw \DOMNode stored at the given position, or null when there is none.
 */
public function getNode(int $position): ?\DOMNode
{
    return $this->nodes[$position] ?? null;
}
1035
/**
 * Returns the number of nodes held by the crawler.
 */
public function count(): int
{
    return \count($this->nodes);
}
1040
/**
 * Returns an iterator over the nodes held by the crawler.
 *
 * @return \ArrayIterator<int, \DOMNode>
 */
public function getIterator(): \ArrayIterator
{
    return new \ArrayIterator($this->nodes);
}
1048
/**
 * Collects element nodes by walking from $node in the given direction.
 *
 * The starting node itself is a candidate; the first node of the crawler is
 * always left out.
 *
 * @param \DOMNode $node       The node to start from
 * @param string   $siblingDir The \DOMNode property followed at each step ('nextSibling' or 'previousSibling')
 *
 * @return \DOMNode[] The element nodes met, in walking order
 */
protected function sibling(\DOMNode $node, string $siblingDir = 'nextSibling'): array
{
    $reference = $this->getNode(0);
    $collected = [];

    for ($cursor = $node; $cursor; $cursor = $cursor->$siblingDir) {
        if ($cursor === $reference || \XML_ELEMENT_NODE !== $cursor->nodeType) {
            continue;
        }

        $collected[] = $cursor;
    }

    return $collected;
}
1062
/**
 * Parses HTML content with the HTML5 parser.
 *
 * When the charset is not supported (see supportsEncoding()), the content is
 * converted to HTML entities first and then parsed as UTF-8.
 */
private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    $encoding = $charset;

    if (!$this->supportsEncoding($encoding)) {
        $htmlContent = $this->convertToHtmlEntities($htmlContent, $encoding);
        $encoding = 'UTF-8';
    }

    return $this->html5Parser->parse($htmlContent, ['encoding' => $encoding]);
}
1072
/**
 * Tells whether mb_convert_encoding() can convert an empty string from UTF-8
 * to the given encoding without throwing and with an empty string as result.
 */
private function supportsEncoding(string $encoding): bool
{
    try {
        $converted = @mb_convert_encoding('', $encoding, 'UTF-8');
    } catch (\Throwable) {
        return false;
    }

    return '' === $converted;
}
1081
/**
 * Parses HTML content with \DOMDocument::loadHTML().
 *
 * Valid UTF-8 content declared as UTF-8 is prefixed with an XML encoding
 * hint; any other content is converted to HTML entities beforehand. Content
 * that is empty once trimmed is not parsed.
 *
 * libxml errors are collected internally while parsing and the previous
 * libxml error mode is restored afterwards, even if parsing throws.
 *
 * @param string $htmlContent The HTML to parse
 * @param string $charset     The charset of the content
 */
private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    if ('UTF-8' === $charset && preg_match('//u', $htmlContent)) {
        $htmlContent = '<?xml encoding="UTF-8">'.$htmlContent;
    } else {
        $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
    }

    $internalErrors = libxml_use_internal_errors(true);

    try {
        $dom = new \DOMDocument('1.0', $charset);
        $dom->validateOnParse = true;

        if ('' !== trim($htmlContent)) {
            @$dom->loadHTML($htmlContent);
        }
    } finally {
        // Never leave the process-wide libxml error mode altered, whatever happens above
        libxml_use_internal_errors($internalErrors);
    }

    return $dom;
}
1103
/**
 * Rewrites every character from U+0080 upwards as a numeric HTML entity.
 *
 * The content is first encoded directly from $charset. If that fails, it is
 * converted to UTF-8 with iconv and encoded from there; whatever that second
 * attempt managed to produce is returned. Warnings raised along the way are
 * turned into exceptions by a temporary error handler, which is always removed
 * again before returning.
 */
private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
{
    $entityMap = [0x80, 0x10FFFF, 0, 0x1FFFFF];

    set_error_handler(static function () {
        throw new \Exception();
    });

    try {
        try {
            return mb_encode_numericentity($htmlContent, $entityMap, $charset);
        } catch (\Exception|\ValueError) {
            // mbstring could not handle the charset; fall back to iconv below.
        }

        try {
            $htmlContent = iconv($charset, 'UTF-8', $htmlContent);
            $htmlContent = mb_encode_numericentity($htmlContent, $entityMap, 'UTF-8');
        } catch (\Exception|\ValueError) {
            // Best effort: keep the content as far as it was converted.
        }

        return $htmlContent;
    } finally {
        restore_error_handler();
    }
}
1125
/**
 * Builds a \DOMXPath for the document and registers each given prefix whose
 * namespace can be discovered.
 *
 * @param array $prefixes Namespace prefixes to look up and register
 *
 * @throws \InvalidArgumentException
 */
private function createDOMXPath(\DOMDocument $document, array $prefixes = []): \DOMXPath
{
    $xpathEngine = new \DOMXPath($document);

    foreach ($prefixes as $prefix) {
        $uri = $this->discoverNamespace($xpathEngine, $prefix);
        if (null === $uri) {
            // No namespace found for this prefix: leave it unregistered.
            continue;
        }

        $xpathEngine->registerNamespace($prefix, $uri);
    }

    return $xpathEngine;
}
1142
/**
 * Resolves a prefix to a namespace URI.
 *
 * Manually registered namespaces win, then previously cached lookups; only
 * after that is the document queried. The query result (including null) is
 * stored in the cache.
 *
 * @throws \InvalidArgumentException
 */
private function discoverNamespace(\DOMXPath $domxpath, string $prefix): ?string
{
    if (\array_key_exists($prefix, $this->namespaces)) {
        return $this->namespaces[$prefix];
    }

    if ($this->cachedNamespaces->offsetExists($prefix)) {
        return $this->cachedNamespaces[$prefix];
    }

    // The default prefix stands for the unnamed namespace declaration.
    $declaredName = $this->defaultNamespacePrefix === $prefix ? '' : $prefix;

    // ask for one namespace, otherwise we'd get a collection with an item for each node
    $matches = $domxpath->query(\sprintf('(//namespace::*[name()="%s"])[last()]', $declaredName));

    $uri = $matches->item(0)?->nodeValue;
    $this->cachedNamespaces[$prefix] = $uri;

    return $uri;
}
1161
/**
 * Extracts the distinct namespace prefixes used in an XPath expression.
 *
 * @return array<int, string> Prefixes keyed by the index of their first match
 */
private function findNamespacePrefixes(string $xpath): array
{
    $found = preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $matches);
    if (!$found) {
        return [];
    }

    return array_unique($matches['prefix']);
}
1170
/**
 * Creates a crawler for some subnodes.
 *
 * The new crawler gets this crawler's URI and base href, then takes over its
 * document, HTML/XML flag, registered namespaces, namespace cache (the same
 * \ArrayObject instance) and HTML5 parser.
 *
 * @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $nodes
 */
private function createSubCrawler(\DOMNodeList|\DOMNode|array|string|null $nodes): static
{
    $child = new static($nodes, $this->uri, $this->baseHref);

    $child->html5Parser = $this->html5Parser;
    $child->cachedNamespaces = $this->cachedNamespaces;
    $child->namespaces = $this->namespaces;
    $child->document = $this->document;
    $child->isHtml = $this->isHtml;

    return $child;
}
1187
/**
 * Returns a CSS-to-XPath converter configured for this crawler's HTML/XML mode.
 *
 * @throws \LogicException If the CssSelector Component is not available
 */
private function createCssSelectorConverter(): CssSelectorConverter
{
    if (class_exists(CssSelectorConverter::class)) {
        return new CssSelectorConverter($this->isHtml);
    }

    throw new \LogicException('To filter with a CSS selector, install the CssSelector component ("composer require symfony/css-selector"). Or use filterXpath instead.');
}
1199
/**
 * Turns an HTML string into a \DOMDocument.
 *
 * Content accepted by canParseHtml5String() goes through the HTML5 parser;
 * everything else is handed to the libxml-based parser.
 */
private function parseHtmlString(string $content, string $charset): \DOMDocument
{
    return $this->canParseHtml5String($content)
        ? $this->parseHtml5($content, $charset)
        : $this->parseXhtml($content, $charset);
}
1212
/**
 * Decides whether the content should go through the HTML5 parser.
 *
 * That requires an HTML5 parser to be set and a case-insensitive
 * "<!doctype html>" that either opens the content or is preceded only by
 * what isValidHtml5Heading() accepts.
 */
private function canParseHtml5String(string $content): bool
{
    if (null === $this->html5Parser) {
        return false;
    }

    $doctypeOffset = stripos($content, '<!doctype html>');
    if (false === $doctypeOffset) {
        return false;
    }

    // Nothing precedes the doctype, so there is no heading to validate.
    if (0 === $doctypeOffset) {
        return true;
    }

    return $this->isValidHtml5Heading(substr($content, 0, $doctypeOffset));
}
1227
/**
 * Checks that the text consists only of an optional BOM, whitespace and
 * comments that contain no ">" character.
 */
private function isValidHtml5Heading(string $heading): bool
{
    $matched = preg_match('/^\x{FEFF}?\s*(<!--[^>]*?-->\s*)*$/u', $heading);

    return 1 === $matched;
}
1232
/**
 * Collapses whitespace and trims both ends.
 *
 * Every run of two or more spaces, newlines, carriage returns, tabs or form
 * feeds, and every single newline, carriage return, tab or form feed, becomes
 * one space.
 */
private function normalizeWhitespace(string $string): string
{
    $collapsed = preg_replace("/(?:[ \n\r\t\x0C]{2,}+|[\n\r\t\x0C])/", ' ', $string);

    return trim($collapsed, " \n\r\t\x0C");
}
1237}