friendship ended with social-app. php is my new best friend
at main 8.5 kB view raw
<?php

namespace Masterminds;

use Masterminds\HTML5\Parser\DOMTreeBuilder;
use Masterminds\HTML5\Parser\Scanner;
use Masterminds\HTML5\Parser\Tokenizer;
use Masterminds\HTML5\Serializer\OutputRules;
use Masterminds\HTML5\Serializer\Traverser;

/**
 * This class offers convenience methods for parsing and serializing HTML5.
 * It is roughly designed to mirror the \DOMDocument native class.
 */
class HTML5
{
    /**
     * Global options for the parser and serializer.
     *
     * @var array
     */
    private $defaultOptions = array(
        // Whether the serializer should aggressively encode all characters as entities.
        'encode_entities' => false,

        // Prevents the parser from automatically assigning the HTML5 namespace to the DOM document.
        'disable_html_ns' => false,
    );

    /**
     * Errors reported by the tree builder during the most recent parse() or
     * parseFragment() call.
     *
     * @var array
     */
    protected $errors = array();

    /**
     * @param array $defaultOptions Options merged over the built-in defaults; they apply to
     *                              every later parse/serialize call unless overridden per call.
     */
    public function __construct(array $defaultOptions = array())
    {
        $this->defaultOptions = array_merge($this->defaultOptions, $defaultOptions);
    }

    /**
     * Get the current default options.
     *
     * @return array
     */
    public function getOptions()
    {
        return $this->defaultOptions;
    }

    /**
     * Load and parse an HTML file.
     *
     * This will apply the HTML5 parser, which is tolerant of many
     * varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML
     * 3. Note that in these cases, not all of the old data will be
     * preserved. For example, XHTML's XML declaration will be removed.
     *
     * The rules governing parsing are set out in the HTML 5 spec.
     *
     * NOTE(review): when the read fails, file_get_contents() / stream_get_contents()
     * return false and that value is forwarded to parse() as-is; confirm whether
     * callers need a guard against a missing or unreadable file.
     *
     * @param string|resource $file    The path to the file to parse. If this is a resource, it is
     *                                 assumed to be an open stream whose pointer is set to the first
     *                                 byte of input.
     * @param array           $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *                      library, and should have been included with your version of PHP.
     */
    public function load($file, array $options = array())
    {
        // Handle the case where file is a resource.
        if (is_resource($file)) {
            return $this->parse(stream_get_contents($file), $options);
        }

        return $this->parse(file_get_contents($file), $options);
    }

    /**
     * Parse a HTML Document from a string.
     *
     * Take a string of HTML 5 (or earlier) and parse it into a
     * DOMDocument.
     *
     * @param string $string  A html5 document as a string.
     * @param array  $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. DOM is part of libxml, which is included with
     *                      almost all distributions of PHP.
     */
    public function loadHTML($string, array $options = array())
    {
        return $this->parse($string, $options);
    }

    /**
     * Convenience function to load an HTML file.
     *
     * This is here to provide backwards compatibility with the
     * PHP DOM implementation. It simply calls load().
     *
     * @param string|resource $file    The path to the file to parse. If this is a resource, it is
     *                                 assumed to be an open stream whose pointer is set to the first
     *                                 byte of input.
     * @param array           $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocument A DOM document. This object type is defined by the libxml
     *                      library, and should have been included with your version of PHP.
     */
    public function loadHTMLFile($file, array $options = array())
    {
        return $this->load($file, $options);
    }

    /**
     * Parse a HTML fragment from a string.
     *
     * @param string $string  The HTML5 fragment as a string.
     * @param array  $options Configuration options when parsing the HTML.
     *
     * @return \DOMDocumentFragment A DOM fragment. The DOM is part of libxml, which is included with
     *                              almost all distributions of PHP.
     */
    public function loadHTMLFragment($string, array $options = array())
    {
        return $this->parseFragment($string, $options);
    }

    /**
     * Return all errors encountered during the parsing phase.
     *
     * @return array
     */
    public function getErrors()
    {
        return $this->errors;
    }

    /**
     * Return true if some errors were encountered during the parsing phase.
     *
     * @return bool
     */
    public function hasErrors()
    {
        return count($this->errors) > 0;
    }

    /**
     * Parse an input string into a full document.
     *
     * @param string $input   The HTML to parse.
     * @param array  $options Per-call options, merged over the defaults. Keys read here:
     *                        - encoding: input encoding handed to the scanner; 'UTF-8' when empty.
     *                        - xmlNamespaces: when non-empty the tokenizer runs in
     *                          Tokenizer::CONFORMANT_XML mode instead of CONFORMANT_HTML.
     *                        The merged array is also passed to the tree builder.
     *
     * @return \DOMDocument
     */
    public function parse($input, array $options = array())
    {
        return $this->buildTree($input, $options, false)->document();
    }

    /**
     * Parse an input string where the input is a fragment rather than a full document.
     *
     * @param string $input   The input data to parse in the form of a string.
     * @param array  $options An array of options; see parse() for the keys read by this class.
     *
     * @return \DOMDocumentFragment
     */
    public function parseFragment($input, array $options = array())
    {
        return $this->buildTree($input, $options, true)->fragment();
    }

    /**
     * Save a DOM into a given file as HTML5.
     *
     * @param mixed           $dom     The DOM to be serialized.
     * @param string|resource $file    The filename to be written or resource to write to. A resource
     *                                 passed in by the caller is left open; a file opened here is
     *                                 closed before returning.
     * @param array           $options Configuration options when serializing the DOM. These include:
     *                                 - encode_entities: Text written to the output is escaped by default and not all
     *                                 entities are encoded. If this is set to true all entities will be encoded.
     *                                 Defaults to false.
     *
     * @throws \RuntimeException When $file is a path that cannot be opened for writing.
     */
    public function save($dom, $file, $options = array())
    {
        // Only streams opened here are ours to close.
        $close = !is_resource($file);
        $stream = $close ? fopen($file, 'wb') : $file;

        // fopen() signals failure with false; stop here instead of handing a
        // non-stream to the serializer and to fclose() below.
        if (!is_resource($stream)) {
            throw new \RuntimeException(sprintf(
                'Unable to open "%s" for writing.',
                is_string($file) ? $file : gettype($file)
            ));
        }

        $options = array_merge($this->defaultOptions, $options);
        $rules = new OutputRules($stream, $options);
        $trav = new Traverser($dom, $stream, $rules, $options);

        $trav->walk();
        /*
         * release the traverser to avoid cyclic references and allow PHP to free memory without waiting for gc_collect_cycles
         */
        $rules->unsetTraverser();
        if ($close) {
            fclose($stream);
        }
    }

    /**
     * Convert a DOM into an HTML5 string.
     *
     * @param mixed $dom     The DOM to be serialized.
     * @param array $options Configuration options when serializing the DOM. These include:
     *                       - encode_entities: Text written to the output is escaped by default and not all
     *                       entities are encoded. If this is set to true all entities will be encoded.
     *                       Defaults to false.
     *
     * @return string A HTML5 document generated from the DOM.
     */
    public function saveHTML($dom, $options = array())
    {
        // Opened read/write: the serialized output is read back from this stream below.
        $stream = fopen('php://temp', 'w+b');

        // save() receives the defaults already merged with the caller's options.
        $this->save($dom, $stream, array_merge($this->defaultOptions, $options));

        // Offset 0 rewinds to the start so the whole buffer is returned.
        $html = stream_get_contents($stream, -1, 0);

        fclose($stream);

        return $html;
    }

    /**
     * Run the scanner/tokenizer pipeline shared by parse() and parseFragment().
     *
     * Clears the error list before parsing, so a parse that throws part-way does
     * not leave errors from an earlier call behind, then stores the tree builder's
     * errors once parsing has finished.
     *
     * @param string $input      The markup to parse.
     * @param array  $options    Per-call options; merged over the default options.
     * @param bool   $isFragment Passed to the tree builder as its first argument:
     *                           true from parseFragment(), false from parse().
     *
     * @return DOMTreeBuilder The tree builder holding the parsed result.
     */
    private function buildTree($input, array $options, $isFragment)
    {
        $this->errors = array();
        $options = array_merge($this->defaultOptions, $options);

        $events = new DOMTreeBuilder($isFragment, $options);
        $scanner = new Scanner($input, !empty($options['encoding']) ? $options['encoding'] : 'UTF-8');
        $mode = !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML;
        $parser = new Tokenizer($scanner, $events, $mode);

        $parser->parse();
        $this->errors = $events->getErrors();

        return $events;
    }
}