1 | 2 | simandl | <?php |
2 | | | |
3 | | | /************************************************************************* |
4 | | | * * |
5 | | | * class.html2text.inc * |
6 | | | * * |
7 | | | ************************************************************************* |
8 | | | * * |
9 | | | * Converts HTML to formatted plain text * |
10 | | | * * |
11 | | | * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com> * |
12 | | | * All rights reserved. * |
13 | | | * * |
14 | | | * This script is free software; you can redistribute it and/or modify * |
15 | | | * it under the terms of the GNU General Public License as published by * |
16 | | | * the Free Software Foundation; either version 2 of the License, or * |
17 | | | * (at your option) any later version. * |
18 | | | * * |
19 | | | * The GNU General Public License can be found at * |
20 | | | * http://www.gnu.org/copyleft/gpl.html. * |
21 | | | * * |
22 | | | * This script is distributed in the hope that it will be useful, * |
23 | | | * but WITHOUT ANY WARRANTY; without even the implied warranty of * |
24 | | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * |
25 | | | * GNU General Public License for more details. * |
26 | | | * * |
27 | | | * Author(s): Jon Abernathy <jon@chuggnutt.com> * |
28 | | | * * |
29 | | | * Last modified: 08/08/07 * |
30 | | | * * |
31 | | | *************************************************************************/ |
32 | | | |
33 | | | |
34 | | | /** |
35 | | | * Takes HTML and converts it to formatted, plain text. |
36 | | | * |
37 | | | * Thanks to Alexander Krug (http://www.krugar.de/) for pointing out and |
38 | | | * correcting an error in the regexp search array. Fixed 7/30/03. |
39 | | | * |
40 | | | * Updated set_html() function's file reading mechanism, 9/25/03. |
41 | | | * |
42 | | | * Thanks to Joss Sanglier (http://www.dancingbear.co.uk/) for adding |
43 | | | * several more HTML entity codes to the $search and $replace arrays. |
44 | | | * Updated 11/7/03. |
45 | | | * |
46 | | | * Thanks to Darius Kasperavicius (http://www.dar.dar.lt/) for |
47 | | | * suggesting the addition of $allowed_tags and its supporting function |
48 | | | * (which I slightly modified). Updated 3/12/04. |
49 | | | * |
50 | | | * Thanks to Justin Dearing for pointing out that a replacement for the |
51 | | | * <TH> tag was missing, and suggesting an appropriate fix. |
52 | | | * Updated 8/25/04. |
53 | | | * |
54 | | | * Thanks to Mathieu Collas (http://www.myefarm.com/) for finding a |
55 | | | * display/formatting bug in the _build_link_list() function: email |
56 | | | * readers would show the left bracket and number ("[1") as part of the |
57 | | | * rendered email address. |
58 | | | * Updated 12/16/04. |
59 | | | * |
60 | | | * Thanks to Wojciech Bajon (http://histeria.pl/) for submitting code |
61 | | | * to handle relative links, which I hadn't considered. I modified his |
62 | | | * code a bit to handle normal HTTP links and MAILTO links. Also for |
63 | | | * suggesting three additional HTML entity codes to search for. |
64 | | | * Updated 03/02/05. |
65 | | | * |
66 | | | * Thanks to Jacob Chandler for pointing out another link condition |
67 | | | * for the _build_link_list() function: "https". |
68 | | | * Updated 04/06/05. |
69 | | | * |
70 | | | * Thanks to Marc Bertrand (http://www.dresdensky.com/) for |
71 | | | * suggesting a revision to the word wrapping functionality; if you |
72 | | | * specify a $width of 0 or less, word wrapping will be ignored. |
73 | | | * Updated 11/02/06. |
74 | | | * |
75 | | | * *** Big housecleaning updates below: |
76 | | | * |
77 | | | * Thanks to Colin Brown (http://www.sparkdriver.co.uk/) for |
78 | | | * suggesting the fix to handle </li> and blank lines (whitespace). |
79 | | | * Christian Basedau (http://www.movetheweb.de/) also suggested the |
80 | | | * blank lines fix. |
81 | | | * |
82 | | | * Special thanks to Marcus Bointon (http://www.synchromedia.co.uk/), |
83 | | | * Christian Basedau, Norbert Laposa (http://ln5.co.uk/), |
84 | | | * Bas van de Weijer, and Marijn van Butselaar |
85 | | | * for pointing out my glaring error in the <th> handling. Marcus also |
86 | | | * supplied a host of fixes. |
87 | | | * |
88 | | | * Thanks to Jeffrey Silverman (http://www.newtnotes.com/) for pointing |
89 | | | * out that extra spaces should be compressed--a problem addressed with |
90 | | | * Marcus Bointon's fixes but that I had not yet incorporated. |
91 | | | * |
92 | | | * Thanks to Daniel Schledermann (http://www.typoconsult.dk/) for |
93 | | | * suggesting a valuable fix with <a> tag handling. |
94 | | | * |
95 | | | * Thanks to Wojciech Bajon (again!) for suggesting fixes and additions, |
96 | | | * including the <a> tag handling that Daniel Schledermann pointed |
97 | | | * out but that I had not yet incorporated. I haven't (yet) |
98 | | | * incorporated all of Wojciech's changes, though I may at some |
99 | | | * future time. |
100 | | | * |
101 | | | * *** End of the housecleaning updates. Updated 08/08/07. |
102 | | | * |
103 | | | * @author Jon Abernathy <jon@chuggnutt.com> |
104 | | | * @version 1.0.0 |
105 | | | * @since PHP 4.0.2 |
106 | | | */ |
class html2text
{
    /**
     * The HTML content to convert.
     *
     * @var string
     */
    public $html;

    /**
     * The converted, formatted plain text (populated by _convert()).
     *
     * @var string
     */
    public $text;

    /**
     * Maximum width of the formatted text, in columns.
     *
     * Set this value to 0 (or less) to disable word wrapping.
     *
     * @var integer
     */
    public $width = 70;

    /**
     * Ordered list of PCRE patterns applied to the markup BEFORE any
     * remaining tags are stripped. Entry N is paired with entry N of
     * $replace; the rules run one after another in array order.
     *
     * Tag names are followed by \b so that, for example, the <b> rule does
     * not also match <body> or <br>, and the <i> rule does not match <img>.
     *
     * @var array
     * @see $replace
     */
    public $search = array(
        "/\r/",                                        // Non-legal carriage return
        "/[\n\t]+/",                                   // Newlines and tabs
        '/[ ]{2,}/',                                   // Runs of spaces, pre-handling
        '/<script\b[^>]*>.*?<\/script>/i',             // <script> blocks
        '/<style\b[^>]*>.*?<\/style>/i',               // <style> blocks
        '/<h[123]\b[^>]*>(.*?)<\/h[123]>/i',           // H1 - H3
        '/<h[456]\b[^>]*>(.*?)<\/h[456]>/i',           // H4 - H6
        '/<p\b[^>]*>/i',                               // <p>
        '/<br\b[^>]*>/i',                              // <br>
        '/<b\b[^>]*>(.*?)<\/b>/i',                     // <b>
        '/<strong\b[^>]*>(.*?)<\/strong>/i',           // <strong>
        '/<i\b[^>]*>(.*?)<\/i>/i',                     // <i>
        '/<em\b[^>]*>(.*?)<\/em>/i',                   // <em>
        '/(<ul\b[^>]*>|<\/ul>)/i',                     // <ul> and </ul>
        '/(<ol\b[^>]*>|<\/ol>)/i',                     // <ol> and </ol>
        '/<li\b[^>]*>(.*?)<\/li>/i',                   // <li> and </li>
        '/<li\b[^>]*>/i',                              // <li> without closing tag
        '/<a [^>]*href="([^"]+)"[^>]*>(.*?)<\/a>/i',   // <a href="">
        '/<hr\b[^>]*>/i',                              // <hr>
        '/(<table\b[^>]*>|<\/table>)/i',               // <table> and </table>
        '/(<tr\b[^>]*>|<\/tr>)/i',                     // <tr> and </tr>
        '/<td\b[^>]*>(.*?)<\/td>/i',                   // <td> and </td>
        '/<th\b[^>]*>(.*?)<\/th>/i',                   // <th> and </th>
    );

    /**
     * Replacements corresponding, by position, to the patterns in $search.
     *
     * A string entry is an ordinary preg_replace() replacement. An entry
     * written as a one-element array names a method of this object that is
     * used as a preg_replace_callback() handler for that pattern. This
     * replaces the former "/e" (eval) replacements, which PHP 7 removed and
     * which executed document content as PHP code.
     *
     * @var array
     * @see $search
     */
    public $replace = array(
        '',                                  // Non-legal carriage return
        ' ',                                 // Newlines and tabs
        ' ',                                 // Runs of spaces, pre-handling
        '',                                  // <script> blocks
        '',                                  // <style> blocks
        array('_replace_heading_upper'),     // H1 - H3
        array('_replace_heading_ucwords'),   // H4 - H6
        "\n\n\t",                            // <p>
        "\n",                                // <br>
        array('_replace_upper'),             // <b>
        array('_replace_upper'),             // <strong>
        '_\\1_',                             // <i>
        '_\\1_',                             // <em>
        "\n\n",                              // <ul> and </ul>
        "\n\n",                              // <ol> and </ol>
        "\t* \\1\n",                         // <li> and </li>
        "\n\t* ",                            // <li> without closing tag
        array('_replace_link'),              // <a href="">
        "\n-------------------------\n",     // <hr>
        "\n\n",                              // <table> and </table>
        "\n",                                // <tr> and </tr>
        "\t\t\\1\n",                         // <td> and </td>
        array('_replace_table_header'),      // <th> and </th>
    );

    /**
     * Ordered list of PCRE patterns for HTML entities, applied AFTER tags
     * have been stripped so that a decoded "<" can never be mistaken for
     * the start of a tag.
     *
     * "&amp;" is decoded last, and the catch-all rule for unknown entities
     * skips it, so text produced by decoding "&amp;" is never re-examined
     * (for example "AT&amp;T; ok" yields "AT&T; ok").
     *
     * @var array
     * @see $ent_replace
     */
    public $ent_search = array(
        '/&(nbsp|#160);/i',                                // Non-breaking space
        '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i',   // Double quotes
        '/&(apos|rsquo|lsquo|#8216|#8217);/i',             // Single quotes
        '/&(gt|#62);/i',                                   // Greater-than
        '/&(lt|#60);/i',                                   // Less-than
        '/&(copy|#169);/i',                                // Copyright
        '/&(trade|#8482|#153);/i',                         // Trademark
        '/&(reg|#174);/i',                                 // Registered
        '/&(mdash|#151|#8212);/i',                         // mdash
        '/&(ndash|minus|#8211|#8722);/i',                  // ndash
        '/&(bull|#149|#8226);/i',                          // Bullet
        '/&(pound|#163);/i',                               // Pound sign
        '/&(euro|#8364);/i',                               // Euro sign
        '/&(?!(?:amp|#38);)#?[a-z0-9]+;/i',                // Unknown/unhandled entities
        '/&(amp|#38);/i',                                  // Ampersand (must come last)
        '/[ ]{2,}/',                                       // Runs of spaces, post-handling
    );

    /**
     * Replacements corresponding, by position, to $ent_search.
     *
     * All replacements are plain ASCII so the output does not depend on
     * the character encoding of this source file.
     *
     * @var array
     * @see $ent_search
     */
    public $ent_replace = array(
        ' ',      // Non-breaking space
        '"',      // Double quotes
        "'",      // Single quotes
        '>',      // Greater-than
        '<',      // Less-than
        '(c)',    // Copyright
        '(tm)',   // Trademark
        '(R)',    // Registered
        '--',     // mdash
        '-',      // ndash
        '*',      // Bullet
        'GBP',    // Pound sign
        'EUR',    // Euro sign
        '',       // Unknown/unhandled entities
        '&',      // Ampersand
        ' ',      // Runs of spaces, post-handling
    );

    /**
     * HTML tags allowed to pass through to the resulting text, in the
     * format accepted by strip_tags() (for example "<p><span>").
     *
     * @var string
     * @see set_allowed_tags()
     */
    public $allowed_tags = '';

    /**
     * Base URL that relative links are resolved against (no trailing slash).
     *
     * @var string
     * @see set_base_url()
     */
    public $url;

    /**
     * Whether the content in $html has already been converted into $text.
     * Internal state; not intended to be set by callers.
     *
     * @var boolean
     */
    public $_converted = false;

    /**
     * Accumulated "[n] url" lines for the link list appended to the text.
     * Internal state; rebuilt on every conversion.
     *
     * @var string
     * @see _build_link_list()
     */
    public $_link_list = '';

    /**
     * Number of links collected so far during the current conversion.
     * Internal state; rebuilt on every conversion.
     *
     * @var integer
     * @see _build_link_list()
     */
    public $_link_count = 0;

    /**
     * Constructor.
     *
     * If the HTML source string (or file) is supplied, the object is ready
     * for use immediately; all that has to be done is to call get_text().
     *
     * @param string  $source    HTML content, or a file path when $from_file is true
     * @param boolean $from_file Indicates $source is a file to pull content from
     */
    public function __construct($source = '', $from_file = false)
    {
        // Compare explicitly instead of using empty() so that the valid
        // document "0" is not discarded.
        if ($source !== null && $source !== '') {
            $this->set_html($source, $from_file);
        }
        $this->set_base_url();
    }

    /**
     * Legacy PHP 4-style constructor name, kept so that subclasses calling
     * parent::html2text() continue to work. PHP 8 no longer invokes this
     * method on "new"; __construct() is the real constructor.
     *
     * @param string  $source    HTML content, or a file path when $from_file is true
     * @param boolean $from_file Indicates $source is a file to pull content from
     * @return void
     */
    public function html2text($source = '', $from_file = false)
    {
        $this->__construct($source, $from_file);
    }

    /**
     * Loads source HTML into memory, either from the $source string or
     * from the file it names.
     *
     * When $from_file is true but $source is not a readable regular file,
     * $source itself is used as the HTML content (historical behaviour).
     *
     * @param string  $source    HTML content, or a file path when $from_file is true
     * @param boolean $from_file Indicates $source is a file to pull content from
     * @return void
     */
    public function set_html($source, $from_file = false)
    {
        $this->html = $source;

        if ($from_file && is_file($source) && is_readable($source)) {
            // file_get_contents() copes with empty files, unlike
            // fread($fp, filesize(...)) which rejects a length of 0.
            $contents = file_get_contents($source);
            if ($contents !== false) {
                $this->html = $contents;
            }
        }

        $this->_converted = false;
    }

    /**
     * Returns the text, converted from HTML. The conversion runs lazily on
     * first access and again after any setter invalidates the result.
     *
     * @return string
     */
    public function get_text()
    {
        if (!$this->_converted) {
            $this->_convert();
        }

        return $this->text;
    }

    /**
     * Prints the text, converted from HTML.
     *
     * @return void
     */
    public function print_text()
    {
        print $this->get_text();
    }

    /**
     * Alias of print_text(); operates identically.
     *
     * @return void
     * @see print_text()
     */
    public function p()
    {
        print $this->get_text();
    }

    /**
     * Sets the allowed HTML tags to pass through to the resulting text.
     *
     * Tags should be in the form "<p>", with no corresponding closing tag.
     * An empty argument is ignored and leaves the current setting in place.
     *
     * @param string $allowed_tags Tags in strip_tags() format
     * @return void
     */
    public function set_allowed_tags($allowed_tags = '')
    {
        if (!empty($allowed_tags)) {
            $this->allowed_tags = $allowed_tags;
            // The cached text was produced with the previous tag list.
            $this->_converted = false;
        }
    }

    /**
     * Sets the base URL used to resolve relative links.
     *
     * With no argument the base URL is derived from the current request's
     * host (if any); otherwise it is left empty.
     *
     * @param string $url Base URL; trailing slashes are removed
     * @return void
     */
    public function set_base_url($url = '')
    {
        if (empty($url)) {
            if (!empty($_SERVER['HTTP_HOST'])) {
                $is_https = !empty($_SERVER['HTTPS'])
                    && strtolower((string) $_SERVER['HTTPS']) !== 'off';
                $this->url = ($is_https ? 'https://' : 'http://') . $_SERVER['HTTP_HOST'];
            } else {
                $this->url = '';
            }
        } else {
            // Strip any trailing slashes for consistency (relative
            // URLs may already start with a slash like "/file.html").
            $this->url = rtrim($url, '/');
        }

        // The link list in any cached text used the previous base URL.
        $this->_converted = false;
    }

    /**
     * Workhorse function that does the actual conversion.
     *
     * Applies the tag rules in $search/$replace, strips any remaining HTML
     * tags, decodes entities via $ent_search/$ent_replace, limits runs of
     * blank lines, appends the collected link list, and word wraps the
     * result to $width columns. Stores the result in $text.
     *
     * @return void
     */
    public function _convert()
    {
        // Variables used for building the link list
        $this->_link_count = 0;
        $this->_link_list  = '';

        // NOTE(review): stripslashes() dates from the magic-quotes era and
        // also removes backslashes that are genuinely part of the content.
        // It is kept so callers that pass escaped input see no change.
        $text = trim(stripslashes((string) $this->html));

        // Run the tag rules, in order
        $text = $this->_apply_rules($text, $this->search, $this->replace);

        // Strip any other HTML tags
        $text = strip_tags($text, $this->allowed_tags);

        // Decode entities only now, so a decoded "<" is never treated as markup
        $text = $this->_apply_rules($text, $this->ent_search, $this->ent_replace);

        // Bring down number of empty lines to 2 max
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // Add link list
        if (!empty($this->_link_list)) {
            $text .= "\n\nLinks:\n------\n" . $this->_link_list;
        }

        // Wrap the text to $width columns; a width of 0 or less disables wrapping
        if ($this->width > 0) {
            $text = wordwrap($text, $this->width);
        }

        $this->text = $text;

        $this->_converted = true;
    }

    /**
     * Applies a list of pattern/replacement pairs to $text sequentially.
     *
     * Pairs are matched by position. A replacement that is an array names
     * a method of this object to call through preg_replace_callback(); any
     * other replacement is passed to preg_replace(). A rule whose regex
     * fails (PCRE returns null) or whose callback is not callable is
     * skipped, leaving the text as it was.
     *
     * @param string $text         Text to transform
     * @param array  $patterns     PCRE patterns
     * @param array  $replacements Replacement strings or callback descriptors
     * @return string
     */
    protected function _apply_rules($text, $patterns, $replacements)
    {
        $replacements = array_values($replacements);

        foreach (array_values($patterns) as $index => $pattern) {
            // Like preg_replace(), a missing replacement means "delete".
            $replacement = isset($replacements[$index]) ? $replacements[$index] : '';

            if (is_array($replacement)) {
                $callback = array($this, isset($replacement[0]) ? $replacement[0] : '');
                if (!is_callable($callback)) {
                    continue;
                }
                $result = preg_replace_callback($pattern, $callback, $text);
            } else {
                $result = preg_replace($pattern, $replacement, $text);
            }

            if ($result !== null) {
                $text = $result;
            }
        }

        return $text;
    }

    /**
     * Callback for H1 - H3: upper-cases the heading and surrounds it with
     * blank lines.
     *
     * @param array $matches Regex matches; index 1 is the heading content
     * @return string
     */
    protected function _replace_heading_upper($matches)
    {
        return "\n\n" . $this->_toupper($matches[1]) . "\n\n";
    }

    /**
     * Callback for H4 - H6: capitalises each word of the heading and
     * surrounds it with blank lines.
     *
     * @param array $matches Regex matches; index 1 is the heading content
     * @return string
     */
    protected function _replace_heading_ucwords($matches)
    {
        return "\n\n" . ucwords($matches[1]) . "\n\n";
    }

    /**
     * Callback for <b> and <strong>: upper-cases the enclosed text.
     *
     * @param array $matches Regex matches; index 1 is the enclosed content
     * @return string
     */
    protected function _replace_upper($matches)
    {
        return $this->_toupper($matches[1]);
    }

    /**
     * Callback for <th>: upper-cases the cell and formats it like a <td>.
     *
     * @param array $matches Regex matches; index 1 is the cell content
     * @return string
     */
    protected function _replace_table_header($matches)
    {
        return "\t\t" . $this->_toupper($matches[1]) . "\n";
    }

    /**
     * Callback for <a href="">: registers the link and returns its text
     * followed by the footnote-style reference.
     *
     * @param array $matches Regex matches; index 1 is the href, index 2 the link text
     * @return string
     */
    protected function _replace_link($matches)
    {
        return $this->_build_link_list($matches[1], $matches[2]);
    }

    /**
     * Upper-cases the text of an HTML fragment while leaving any tags in
     * it untouched, so attribute values such as URLs keep their case for
     * the rules that run afterwards.
     *
     * @param string $str HTML fragment
     * @return string
     */
    protected function _toupper($str)
    {
        // With the delimiter captured, even indexes are text and odd
        // indexes are the tags that separated them.
        $parts = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($parts === false) {
            return strtoupper($str);
        }

        foreach ($parts as $index => $part) {
            if ($index % 2 === 0) {
                $parts[$index] = strtoupper($part);
            }
        }

        return implode('', $parts);
    }

    /**
     * Helper function called on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. http, https and mailto links are listed as-is; javascript
     * links are not listed at all; everything else is treated as relative
     * and prefixed with the base URL. Schemes are matched case-insensitively.
     *
     * @param string $link    URL of the link
     * @param string $display Part of the text to associate number with
     * @return string The display text, followed by " [n]" when the link was listed
     */
    public function _build_link_list($link, $display)
    {
        if (preg_match('/^javascript:/i', $link)) {
            // Don't count the link; ignore it
            return $display;
        }

        if (preg_match('/^(https?:\/\/|mailto:)/i', $link)) {
            $target = $link;
        } else {
            // Relative link (this includes href="#anchor"): resolve
            // against the base URL, adding the separating slash if needed.
            $target = $this->url . (substr($link, 0, 1) != '/' ? '/' : '') . $link;
        }

        $this->_link_count++;
        $this->_link_list .= '[' . $this->_link_count . '] ' . $target . "\n";

        return $display . ' [' . $this->_link_count . ']';
    }
}