friendship ended with social-app. php is my new best friend
1<?php
2
3namespace Masterminds;
4
5use Masterminds\HTML5\Parser\DOMTreeBuilder;
6use Masterminds\HTML5\Parser\Scanner;
7use Masterminds\HTML5\Parser\Tokenizer;
8use Masterminds\HTML5\Serializer\OutputRules;
9use Masterminds\HTML5\Serializer\Traverser;
10
11/**
12 * This class offers convenience methods for parsing and serializing HTML5.
13 * It is roughly designed to mirror the \DOMDocument native class.
14 */
class HTML5
{
    /**
     * Global options for the parser and serializer.
     *
     * Per-call options are merged over these with array_merge(), so a key
     * passed to an individual method wins over the default of the same name.
     *
     * @var array
     */
    private $defaultOptions = array(
        // Whether the serializer should aggressively encode all characters as entities.
        'encode_entities' => false,

        // Prevents the parser from automatically assigning the HTML5 namespace to the DOM document.
        'disable_html_ns' => false,
    );

    /**
     * Errors reported by the tree builder during the most recent
     * parse() / parseFragment() call.
     *
     * @var array
     */
    protected $errors = array();

    /**
     * @param array $defaultOptions Options merged over the built-in defaults; they apply to
     *                              every later parse/serialize call made on this instance.
     */
    public function __construct(array $defaultOptions = array())
    {
        $this->defaultOptions = array_merge($this->defaultOptions, $defaultOptions);
    }

    /**
     * Get the current default options.
     *
     * @return array
     */
    public function getOptions()
    {
        return $this->defaultOptions;
    }

    /**
     * Load and parse an HTML file.
     *
     * This will apply the HTML5 parser, which is tolerant of many
     * varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML
     * 3. Note that in these cases, not all of the old data will be
     * preserved. For example, XHTML's XML declaration will be removed.
     *
     * The rules governing parsing are set out in the HTML 5 spec.
     *
     * If reading fails, the false returned by stream_get_contents() /
     * file_get_contents() is handed to parse() unchanged; this method does
     * not detect the failure itself.
     *
     * @param string|resource $file    The path to the file to parse. If this is a resource, it is
     *                                 assumed to be an open stream whose pointer is set to the first
     *                                 byte of input.
     * @param array           $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *                      library, and should have been included with your version of PHP.
     */
    public function load($file, array $options = array())
    {
        // A resource is read from its current position; anything else is treated as a path.
        $contents = is_resource($file) ? stream_get_contents($file) : file_get_contents($file);

        return $this->parse($contents, $options);
    }

    /**
     * Parse an HTML document from a string.
     *
     * Take a string of HTML 5 (or earlier) and parse it into a
     * DOMDocument.
     *
     * @param string $string  An HTML5 document as a string.
     * @param array  $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. DOM is part of libxml, which is included with
     *                      almost all distributions of PHP.
     */
    public function loadHTML($string, array $options = array())
    {
        return $this->parse($string, $options);
    }

    /**
     * Convenience function to load an HTML file.
     *
     * This is here to provide backwards compatibility with the
     * PHP DOM implementation. It simply calls load().
     *
     * @param string|resource $file    The path to the file to parse. If this is a resource, it is
     *                                 assumed to be an open stream whose pointer is set to the first
     *                                 byte of input.
     * @param array           $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *                      library, and should have been included with your version of PHP.
     */
    public function loadHTMLFile($file, array $options = array())
    {
        return $this->load($file, $options);
    }

    /**
     * Parse an HTML fragment from a string.
     *
     * @param string $string  The HTML5 fragment as a string.
     * @param array  $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with
     *                              almost all distributions of PHP.
     */
    public function loadHTMLFragment($string, array $options = array())
    {
        return $this->parseFragment($string, $options);
    }

    /**
     * Return all errors encountered during the last parsing phase.
     *
     * @return array
     */
    public function getErrors()
    {
        return $this->errors;
    }

    /**
     * Return true if any errors were encountered during the last parsing phase.
     *
     * @return bool
     */
    public function hasErrors()
    {
        return count($this->errors) > 0;
    }

    /**
     * Parse an input string into a full document.
     *
     * Recognised option keys (in addition to the defaults): 'encoding', the
     * input encoding handed to the scanner ('UTF-8' when empty or absent), and
     * 'xmlNamespaces', which when non-empty switches the tokenizer from
     * Tokenizer::CONFORMANT_HTML to Tokenizer::CONFORMANT_XML.
     *
     * @param string $input   The markup to parse.
     * @param array  $options Per-call options, merged over the instance defaults.
     *
     * @return \DOMDocument
     */
    public function parse($input, array $options = array())
    {
        return $this->runParser($input, $options, false)->document();
    }

    /**
     * Parse an input string where the string is a fragment.
     *
     * Accepts the same option keys as parse().
     *
     * @param string $input   The input data to parse in the form of a string.
     * @param array  $options An array of options.
     *
     * @return \DOMDocumentFragment
     */
    public function parseFragment($input, array $options = array())
    {
        return $this->runParser($input, $options, true)->fragment();
    }

    /**
     * Save a DOM into a given file as HTML5.
     *
     * A stream passed in by the caller is written to but left open; a stream
     * opened here from a path is closed before returning.
     *
     * @param mixed           $dom     The DOM to be serialized.
     * @param string|resource $file    The filename to be written or resource to write to.
     * @param array           $options Configuration options when serializing the DOM. These include:
     *                                 - encode_entities: Text written to the output is escaped by default and not all
     *                                 entities are encoded. If this is set to true all entities will be encoded.
     *                                 Defaults to false.
     *
     * @throws \RuntimeException If $file is a path that cannot be opened for writing.
     */
    public function save($dom, $file, $options = array())
    {
        // Only close what we opened ourselves.
        $close = !is_resource($file);
        $stream = $close ? fopen($file, 'wb') : $file;
        if (false === $stream) {
            // Fail here with a clear message instead of handing `false` to the serializer.
            throw new \RuntimeException(sprintf('Unable to open "%s" for writing.', $file));
        }

        $options = array_merge($this->defaultOptions, $options);
        $rules = new OutputRules($stream, $options);
        $trav = new Traverser($dom, $stream, $rules, $options);

        // Capture any failure so the cleanup below always runs. Catch-and-rethrow is used
        // instead of `finally` to stay compatible with the PHP 5.3-era syntax of this file;
        // \Throwable only matches on PHP 7+, \Exception covers older runtimes.
        $failure = null;
        try {
            $trav->walk();
        } catch (\Throwable $e) {
            $failure = $e;
        } catch (\Exception $e) {
            $failure = $e;
        }

        /*
         * release the traverser to avoid cyclic references and allow PHP to free memory without waiting for gc_collect_cycles
         */
        $rules->unsetTraverser();
        if ($close) {
            fclose($stream);
        }

        if (null !== $failure) {
            throw $failure;
        }
    }

    /**
     * Convert a DOM into an HTML5 string.
     *
     * @param mixed $dom     The DOM to be serialized.
     * @param array $options Configuration options when serializing the DOM. These include:
     *                       - encode_entities: Text written to the output is escaped by default and not all
     *                       entities are encoded. If this is set to true all entities will be encoded.
     *                       Defaults to false.
     *
     * @return string An HTML5 document generated from the DOM.
     */
    public function saveHTML($dom, $options = array())
    {
        // The buffer is written by save() and then read back, so open it read/write.
        $stream = fopen('php://temp', 'w+b');
        $this->save($dom, $stream, array_merge($this->defaultOptions, $options));

        // Offset 0 reads from the start of the buffer regardless of the write position.
        $html = stream_get_contents($stream, -1, 0);

        fclose($stream);

        return $html;
    }

    /**
     * Tokenize $input into a fresh tree builder and record the builder's errors.
     *
     * Shared by parse() and parseFragment(). Errors from any previous run are
     * cleared first, so an exception thrown while parsing cannot leave stale
     * errors visible through getErrors().
     *
     * @param string $input      The markup to parse.
     * @param array  $options    Per-call options, merged over the instance defaults.
     * @param bool   $isFragment Passed as the first DOMTreeBuilder constructor argument.
     *
     * @return DOMTreeBuilder The builder after parsing; callers pull document() or fragment() from it.
     */
    private function runParser($input, array $options, $isFragment)
    {
        $this->errors = array();

        $options = array_merge($this->defaultOptions, $options);
        $encoding = !empty($options['encoding']) ? $options['encoding'] : 'UTF-8';
        $mode = !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML;

        $events = new DOMTreeBuilder($isFragment, $options);
        $scanner = new Scanner($input, $encoding);
        $parser = new Tokenizer($scanner, $events, $mode);

        $parser->parse();
        $this->errors = $events->getErrors();

        return $events;
    }
}