File Html.php | Jyxo PHP Library

  1: <?php
  2: 
  3: /**
  4:  * Jyxo PHP Library
  5:  *
  6:  * LICENSE
  7:  *
  8:  * This source file is subject to the new BSD license that is bundled
  9:  * with this package in the file license.txt.
 10:  * It is also available through the world-wide-web at this URL:
 11:  * https://github.com/jyxo/php/blob/master/license.txt
 12:  */
 13: 
 14: /**
 15:  * Functions for HTML processing.
 16:  *
 17:  * @category Jyxo
 18:  * @package Jyxo_Html
 19:  * @copyright Copyright (c) 2005-2011 Jyxo, s.r.o.
 20:  * @license https://github.com/jyxo/php/blob/master/license.txt
 21:  * @author Jaroslav Hanslík
 22:  */
 23: class Jyxo_Html
 24: {
 25:     /**
 26:      * Constructor preventing from creating instances of a static class.
 27:      *
 28:      * @throws LogicException If trying to create an instance
 29:      */
 30:     public final function __construct()
 31:     {
 32:         throw new LogicException(sprintf('Cannot create an instance of a static class %s.', get_class($this)));
 33:     }
 34: 
 35:     /**
 36:      * Tests if the given text contains at least one HTML tag.
 37:      * It is just an estimation.
 38:      *
 39:      * @param string $text Input text to be tested
 40:      * @return boolean
 41:      */
 42:     public static function is($text)
 43:     {
 44:         return (bool) preg_match('~<[a-z][a-z0-9]*(\s[^<]*)?>~i', $text);
 45:     }
 46: 
 47:     /**
 48:      * Fixes an invalid HTML source, unifies quotes and removes unnecessary whitespace.
 49:      * Required the Tidy PHP extension.
 50:      *
 51:      * @param string $html Input HTML source
 52:      * @return string
 53:      */
 54:     public static function repair($html)
 55:     {
 56:         // HTML fixing
 57:         static $config = array(
 58:             'newline' => 'LF',              // Uses LF line endings
 59:             'indent' => false,              // Removes indent
 60:             'output-xhtml' => true,         // Output will be in XHTML format
 61:             'output-bom' => false,          // No BOM
 62:             'doctype' => 'auto',            // Automatic doctype
 63:             // 'clean' => true,             // Removes presentation tags (inline styles would be moved into <style> elements)
 64:             'bare' => true,                 // Cleans MS HTML mess
 65:             'wrap' => 0,                    // No wrapping
 66:             'wrap-sections' => false,       // No <![ ... ]> wrapping
 67:             // 'quote-marks' => true,       // Replaces quotes with appropriate entities (causes problems with later regular expression processing)
 68:             // 'logical-emphasis' => true,  // Replaces all <i> and <b> tags with <em> and <strong> (styles cannot be parsed after)
 69:             'enclose-text' => true,         // Text inside <body> encapsulates with a <p> tag
 70:             'merge-divs' => false,          // Disables <div> merging
 71:             'merge-spans' => false,         // Disables <span> merging
 72:             // 'hide-comments' => true,     // Removes comments (it would remove conditional comments used when inserting Flash)
 73:             'force-output' => true,         // Makes output even on error
 74:             'show-errors' => 0,             // Don't show any errors
 75:             'show-warnings' => false,       // Don't show any warnings
 76:             'escape-cdata' => true,         // Makes an ordinary text from CDATA blocks
 77:             'preserve-entities' => true     // Preserves correctly formatted entities
 78:             // 'drop-proprietary-attributes' => true,   // Removes proprietary attributes (it would remove e.g. the background attribute)
 79:             // 'drop-font-tags' => true     // Removes <FONT> and <CENTER> tags
 80:         );
 81:         $html = tidy_repair_string($html, $config, 'utf8');
 82: 
 83:         // Removes namespace <?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" /? > generated by MS Word
 84:         $html = preg_replace('~<\?xml:namespace[^>]*>~i', '', $html);
 85: 
 86:         // Removes unnecessary line breaks and keeps them inside <pre> elements
 87:         // Tidy adds one more line breaks inside <pre> elements
 88:         $html = preg_replace("~(<pre[^>]*>)\n~", '\1', $html);
 89:         $html = preg_replace("~\n</pre>~", '</pre>', $html);
 90:         $html = preg_replace_callback('~(<pre[^>]*>)(.+?)(</pre>)~s', function($matches) {
 91:             return $matches[1] . strtr(nl2br($matches[2]), array('\"' => '"')) . $matches[3];
 92:         }, $html);
 93:         // Strip line breaks
 94:         $html = strtr($html, array("\r" => '', "\n" => ''));
 95: 
 96:         // Replace single quotes with double quotes (for easier processing later)
 97:         $html = preg_replace('~(<[a-z][a-z0-9]*[^>]+[a-z]+=)\'([^\']*)\'~i', '\1"\2"', $html);
 98: 
 99:         // Remove unnecessary spaces inside elements (for easier processing later)
100:         $html = preg_replace('~(<[a-z][a-z0-9]*[^>]+[a-z]+=")\s+([^"]*")~i', '\1\2', $html);
101:         $html = preg_replace('~(<[a-z][a-z0-9]*[^>]+[a-z]+="[^"]*)\s+(")~i', '\1\2', $html);
102: 
103:         return $html;
104:     }
105: 
106:     /**
107:      * Removes given tags from the HTML source.
108:      * If no tags are given, the default set is used.
109:      * Expects valid HTML code.
110:      *
111:      * @param string $html HTML source code
112:      * @param array $tags Tags to be removed
113:      * @return string
114:      */
115:     public static function removeTags($html, array $tags = array())
116:     {
117:         // Default set of tags
118:         static $default = array(
119:             'frameset', 'frame', 'noframes', 'iframe', 'script', 'noscript', 'style', 'link',
120:             'object', 'embed', 'form', 'input', 'select', 'textarea', 'button'
121:         );
122: 
123:         // If no tags are set, the default set will be used
124:         if (empty($tags)) {
125:             $tags = $default;
126:         }
127: 
128:         // Remove given tags
129:         foreach ($tags as $tag) {
130:             switch ($tag) {
131:                 // Embed
132:                 case 'embed':
133:                     // Second variant is because of Tidy that processes <embed> this way
134:                     $pattern = array('~\s*<embed[^>]*>.*?</embed>~is', '~\s*<embed[^>]*>~is');
135:                     break;
136:                 // Self closing tags
137:                 case 'link':
138:                 case 'meta':
139:                 case 'br':
140:                 case 'hr':
141:                 case 'img':
142:                 case 'input':
143:                     $pattern = array('~\s*<' . $tag . '[^>]*>~is');
144:                     break;
145:                 // Pair tags
146:                 default:
147:                     $pattern = array('~\s*<' . $tag . '(?:\s+[^>]*)?>.*?</' . $tag . '>~is');
148:                     break;
149:             }
150: 
151:             $html = preg_replace($pattern, '', $html);
152:         }
153: 
154:         return $html;
155:     }
156: 
157:     /**
158:      * Removes tags of the same type nested into each other from the HTML source.
159:      * Expects valid HTML source
160:      *
161:      * @param string $html HTML source code
162:      * @param string $tag Tags to be processed
163:      * @return string
164:      */
165:     public static function removeInnerTags($html, $tag)
166:     {
167:         $tag = (string) $tag;
168: 
169:         if (preg_match_all('~(?:<' . $tag . '>)|(?:</' . $tag . '>)|(?:<[^>]+>)|(?:[^<]+)~i', $html, $matches)) {
170:             $html = '';
171:             $level = 0;
172:             foreach ($matches[0] as $htmlPart) {
173:                 if (0 === stripos($htmlPart, '<' . $tag)) {
174:                     $level++;
175:                     if (1 === $level) {
176:                         $html .= $htmlPart;
177:                     }
178:                 } elseif (0 === stripos($htmlPart, '</' . $tag)) {
179:                     if (1 === $level) {
180:                         $html .= $htmlPart;
181:                     }
182:                     $level--;
183:                 } else {
184:                     $html .= $htmlPart;
185:                 }
186:             }
187:         }
188: 
189:         return $html;
190:     }
191: 
192:     /**
193:      * Removes given attributes from the HTML source.
194:      * If no attributes are given, the default set will be used.
195:      * Expects valid HTML source.
196:      *
197:      * @param string $html HTML source code
198:      * @param array $attributes Attributes to be removed
199:      * @return string
200:      */
201:     public static function removeAttributes($html, array $attributes = array())
202:     {
203:         // Default set of attributes
204:         static $default = array('id', 'class');
205: 
206:         // If no attributes are given, the default set will be used
207:         if (empty($attributes)) {
208:             $attributes = $default;
209:         }
210: 
211:         // Remove given attributes
212:         foreach ($attributes as $attribute) {
213:             $html = preg_replace('~(<[a-z][a-z0-9]*[^>]*?)\s+' . $attribute . '="[^"]*"~is', '\1', $html);
214:         }
215: 
216:         return $html;
217:     }
218: 
219:     /**
220:      * Removes all javascript events from the HTML source.
221:      * If it is necessary to remove only certain events, the removeAttributes() method can be used.
222:      * Expects valid HTML source.
223:      *
224:      * @param string $html HTML source code
225:      * @return string
226:      */
227:     public static function removeJavascriptEvents($html)
228:     {
229:         // A tag can have multiple events, therefore it is necessary to process the source multiple times
230:         while (preg_match('~<[a-z][a-z0-9]*[^>]*?\s+on[a-z]+="[^"]*"~is', $html)) {
231:             $html = preg_replace('~(<[a-z][a-z0-9]*[^>]*?)\s+on[a-z]+="[^"]*"~is', '\1', $html);
232:         }
233:         return $html;
234:     }
235: 
236:     /**
237:      * Removes foreign images from the HTML source.
238:      * Keeps <img> tags (only set the value about:blank into its src attribute), because removing the tag entirely could affect
239:      * the page layout.
240:      * Expects valid HTML source.
241:      *
242:      * @param string $html HTML source code
243:      * @return string
244:      */
245:     public static function removeRemoteImages($html)
246:     {
247:         static $remoteImages = array(
248:             '~(<img[^>]+src=")http(?:s)?://[^"]+(")~i',
249:             '~(<[a-z][a-z0-9]*[^>]+background=")http(?:s)?://[^"]+(")~i',
250:             '~(<[a-z][a-z0-9]*[^>]+style="[^"]*background\s*[:])([-a-z0-9#%\s]*)url\([^)]+\)(;)?~is',
251:             '~(<[a-z][a-z0-9]*[^>]+style="[^"]*)background-image\s*[:]([-a-z0-9#%\s]*)url\([^)]+\)(;)?~is',
252:             '~(<[a-z][a-z0-9]*[^>]+style="[^"]*list-style\s*[:])([-a-z0-9\s]*)url\([^)]+\)(;)?~is',
253:             '~(<[a-z][a-z0-9]*[^>]+style="[^"]*)list-style-image\s*[:]([-a-z0-9\s]*)url\([^)]+\)(;)?~is'
254:         );
255:         // We use value about:blank for the <img> tag's src attribute, because removing the tag entirely could affect the page layout
256:         static $remoteImagesReplacement = array(
257:             '\1about:blank\2',
258:             '\1\2',
259:             '\1\2\3',
260:             '\1',
261:             '\1\2\3',
262:             '\1'
263:         );
264: 
265:         return preg_replace($remoteImages, $remoteImagesReplacement, $html);
266:     }
267: 
268:     /**
269:      * Removes possibly dangerous attributes that could contain XSS code from the HTML source.
270:      *
271:      * @param string $html HTML source code
272:      * @return string
273:      */
274:     public static function removeDangerous($html)
275:     {
276:         static $dangerous = array(
277:             '~\s+href="javascript[^"]*"~i',
278:             '~\s+src="javascript[^"]*"~i',
279:             '~\s+href="data:[^"]*"~i',  // See http://www.soom.cz/index.php?name=projects/testmail/main
280:             '~\s+src="data:[^"]*"~i'
281:         );
282: 
283:         return preg_replace($dangerous, '', $html);
284:     }
285: 
286:     /**
287:      * Returns <body> contents from the given HTML source.
288:      * Expects valid HTML source.
289:      *
290:      * @param string $html HTML source code
291:      * @return string
292:      */
293:     public static function getBody($html)
294:     {
295:         // If the source code contains <body>, return this element's contents
296:         if (preg_match('~<body([^>]*)>(.*?)</body>~is', $html, $matches)) {
297:             $body = trim($matches[2]);
298: 
299:             // Converts <body> inline styles to a newly created <div> element
300:             if (preg_match('~style="[^"]+"~i', $matches[1], $style)) {
301:                 $body = '<div ' . $style[0] . '>' . $body . '</div>';
302:             }
303: 
304:             return $body;
305:         }
306: 
307:         // Return everything otherwise
308:         return $html;
309:     }
310: 
311:     /**
312:      * Converts text to HTML source code.
313:      *
314:      * @param string $text Input text
315:      * @param boolean $convertLinks Convert urls and emails to links
316:      * @return string
317:      */
318:     public static function fromText($text, $convertLinks = true)
319:     {
320:         // Trimming whitespace (except spaces)
321:         $text = trim($text, "\r\n");
322: 
323:         // Two empty lines max
324:         $text = preg_replace("~\n\s+\n~", "\n\n", $text);
325: 
326:         // Special chars
327:         $html = htmlspecialchars($text, ENT_QUOTES, 'utf-8', false);
328: 
329:         // Two spaces mean an indent, convert to non-breaking spaces
330:         $html = str_replace('  ', '&nbsp;&nbsp;', $html);
331:         // Convert tabs to four non-breaking spaces
332:         $html = str_replace("\t", '&nbsp;&nbsp;&nbsp;&nbsp;', $html);
333: 
334:         // Paragraph
335:         $html = '<p>' . preg_replace("~\n\n[^\\n]?~", '</p><p>\0', $html) . '</p>';
336:         $html = str_replace("\n", "<br />\n", $html);
337:         $html = str_ireplace('<p><br />', "<p>\n", $html);
338: 
339:         // Citation
340:         preg_match_all('~(?:(^(?:<p>)?\s*&gt;(?:&gt;|\s)*)(.*)$)|(?:.+)~im', $html, $matches);
341:         $html = '';
342:         $offset = 0;
343:         for ($i = 0; $i < count($matches[0]); $i++) {
344:             $currentOffset = substr_count($matches[1][$i], '&gt;');
345:             if ($currentOffset > 0) {
346:                 if ($currentOffset > $offset) {
347:                     $html .= str_repeat('<blockquote type="cite">', $currentOffset - $offset) . '<p>';
348:                     $offset = $currentOffset;
349:                 } elseif ($currentOffset < $offset) {
350:                     $html .= '</p>' . str_repeat('</blockquote>', $offset - $currentOffset) . '<p>';
351:                     $offset = $currentOffset;
352:                 }
353: 
354:                 $html .= $matches[2][$i];
355:             } else {
356:                 if ($offset > 0) {
357:                     $html .= '</p>' . str_repeat('</blockquote>', $offset) . '<p>';
358:                     $offset = 0;
359:                 }
360: 
361:                 $html .= $matches[0][$i];
362:             }
363:         }
364:         if ($offset > 0) {
365:             $html .= '</p>' . str_repeat('</blockquote>', $offset);
366:         }
367: 
368:         // Removes empty lines that were created during previous processing
369:         $html = preg_replace('~(?:<br />)+</p></blockquote>~i', '</p></blockquote>', $html);
370:         $html = str_ireplace('<p><br /></p>', '', $html);
371:         $html = str_ireplace('<p><br />', '<p>', $html);
372: 
373:         // Emails and urls
374:         if ($convertLinks) {
375:             $html = self::linkFromText($html);
376:         }
377: 
378:         return $html;
379:     }
380: 
381:     /**
382:      * Converts text to a link to an url or email.
383:      *
384:      * @param string $text Input text
385:      * @return string
386:      */
387:     public static function linkFromText($text)
388:     {
389:         $patternGenericTld = '(?:tld|aero|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|asia|post|geo)';
390:         $patternTld = '(?-i:' . $patternGenericTld . '|[a-z]{2})';
391:         $patternDomain = '(?:(?:[a-z]|[a-z0-9](?:[-a-z0-9]{0,61}[a-z0-9]))[.])*(?:[a-z0-9](?:[-a-z0-9]{0,61}[a-z0-9])[.]' . $patternTld . ')';
392: 
393:         $pattern8bit = '(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]?[0-9])';
394:         $patternIPv4 = '(?:' . $pattern8bit . '(?:[.]' . $pattern8bit . '){3})';
395: 
396:         // a:b:c:d:e:f:g:h
397:         $patternIpV6Variant8Hex = '(?:(?:[0-9a-f]{1,4}:){7}[0-9a-f]{1,4})';
398:         // Compressed a::b
399:         $patternIpV6VariantCompressedHex = '(?:(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?)::(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?))';
400:         // IPv4 mapped to  IPv6 a:b:c:d:e:f:w.x.y.z
401:         $patternIpV6VariantHex4Dec = '(?:(?:(?:[0-9a-f]{1,4}:){6})' . $patternIPv4 . ')';
402:         // Compressed IPv4 mapped to IPv6 a::b:w.x.y.z
403:         $patternIpV6VariantCompressedHex4Dec = '(?:(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?)::(?:(?:[0-9a-f]{1,4}:)*)' . $patternIPv4 . ')';
404:         $patternIpV6 = '(?:' . $patternIpV6Variant8Hex . '|' . $patternIpV6VariantCompressedHex . '|' . $patternIpV6VariantHex4Dec . '|' . $patternIpV6VariantCompressedHex4Dec . ')';
405: 
406:         // mailto:username
407:         $patternEmail = '(?:mailto:)?(?:[-\w!#\$%&\'*+/=?^`{|}\~]+(?:[.][-\w!#\$%&\'*+/=?^`{|}\~]+)*)';
408:         // @domain.tld
409:         $patternEmail .= '(?:@' . $patternDomain . ')';
410: 
411:         // protocol://user:password@
412:         $patternUrl = '(?:(?:http|ftp)s?://(?:[\S]+(?:[:][\S]*)?@)?)?';
413:         // domain.tld, IPv4 or IPv6
414:         $patternUrl .= '(?:' . $patternDomain . '|' . $patternIPv4 . '|' . $patternIpV6 . ')';
415:         // :port/path/file.extension
416:         $patternUrl .= '(?::[0-9]+)?(?:(?:/[-\w\pL\pN\~.:!%]+)*(?:/|[.][a-z0-9]{2,4})?)?';
417:         // ?query#hash
418:         $patternUrl .= '(?:[?][\]\[-\w\pL\pN.,?!\~%#@&;:/\'\=+]*)?(?:#[\]\[-\w\pL\pN.,?!\~%@&;:/\'\=+]*)?';
419: 
420:         return preg_replace_callback('~(^|[^\pL\pN])(?:(' . $patternEmail . ')|(' . $patternUrl . '))(?=$|\W)~iu', function($matches) {
421:             // Url
422:             if (isset($matches[3])) {
423:                 $url = $matches[3];
424:                 // Remove special chars at the end
425:                 if (preg_match('~(([.,:;?!>)\]}]|(&gt;))+)$~i', $url, $matches2)) {
426:                     $punctuation = $matches2[1];
427:                     // strlen is necessary because of &gt;
428:                     $url = mb_substr($url, 0, -strlen($matches2[1]), 'utf-8');
429:                 } else {
430:                     $punctuation = '';
431:                 }
432: 
433:                 // Add missing http://
434:                 $linkUrl = !preg_match('~^(http|ftp)s?://~i', $url) ? 'http://' .  $url : $url;
435: 
436:                 // Create a link
437:                 return $matches[1] . '<a href="' . $linkUrl . '">' . $url . '</a>' . $punctuation;
438:             }
439: 
440:             // Emails
441:             if (isset($matches[2])) {
442:                 $email = $matches[2];
443:                 if (false !== stripos($email, 'mailto:')) {
444:                     $email = substr($matches[2], 7);
445:                     $protocol = 'mailto:';
446:                 } else {
447:                     $protocol = '';
448:                 }
449:                 return $matches[1] . '<a href="mailto:' . $email . '">' . $protocol . $email . '</a>';
450:             }
451:         }, $text);
452:     }
453: 
454:     /**
455:      * Converts HTML source code to plaintext.
456:      *
457:      * @param string $html HTML source code
458:      * @return string
459:      */
460:     public static function toText($html)
461:     {
462:         $text = $html;
463: 
464:         // Re-format lines
465:         // <pre>
466:         $text = preg_replace_callback('~<pre[^>]*>(.+?)</pre>~is', function($matches) {
467:             // Line breaks are converted to <br />, that are removed later
468:             return nl2br($matches[1]);
469:         }, $text);
470:         // \r, redundant line breaks, tabs and <br />
471:         $text = preg_replace(
472:             array("~\r~", "~[\n\t]+~", '~<br[^>]*>~i'),
473:             array('', ' ', "\n"),
474:             $text
475:         );
476: 
477:         // Processing of most tags and entities
478:         static $search = array(
479:             '~<h[3-6][^>]*>(.+?)</h[3-6]>~is',  // <h3> to <h6>
480:             '~(<div[^>]*>)|(</div>)~i',         // <div> and </div>
481:             '~(<p(?:\s+[^>]+)?>)|(</p>)~i',     // <p> and </p>
482:             '~(<table[^>]*>)|(</table>)~i',     // <table> and </table>
483:             '~</tr>*~i',                        // </tr>
484:             '~<td[^>]*>(.+?)</td>~is',          // <td> and </td>
485:             // '~(<code[^>]*>)|(</code>)~i',    // <code> and </code>
486:             '~(&hellip;)~i',                    // Ellipsis
487:             '~(&#8220;)|(&#8221;)~i',           // Quotes
488:             '~(&apos;)~i',                      // Apostrophe
489:             '~(&copy;)|(&#169;)~i',             // Copyright
490:             '~&trade;~i',                       // Trademark
491:             '~&reg;~i',                         // Registered trademark
492:             '~(&mdash;)|(&ndash;)~i'            // Dash and hyphen
493:         );
494:         static $replace = array(
495:             "\n\n\\1\n\n",  // <h3> to <h6>
496:             "\n\n",         // <div> and </div>
497:             "\n\n",         // <p> and </p>
498:             "\n\n",         // <table> and </table>
499:             "\n",           // </tr>
500:             "\\1\t",        // <td> and </td>
501:             // "\n\n",      // <code> and </code>
502:             '...',          // Ellipsis
503:             '"',            // Quotes
504:             '\'',           // Apostrophe
505:             '(c)',          // Copyright
506:             '(tm)',         // Trademark
507:             '(R)',          // Registered trademark
508:             '-'             // Dash and hyphen
509:         );
510:         $text = preg_replace($search, $replace, $text);
511: 
512:         // <h1> and <h2>
513:         $text = preg_replace_callback('~<h[12][^>]*>(.+?)</h[12]>~is', function($matches) {
514:             return "\n\n\n" . mb_strtoupper($matches[1], 'utf-8') . "\n\n";
515:         }, $text);
516:         // <strong>
517:         $text = preg_replace_callback('~<strong[^>]*>(.+?)</strong>~is', function($matches) {
518:             return mb_strtoupper($matches[1], 'utf-8');
519:         }, $text);
520:         // <hr />
521:         $text = preg_replace_callback('~<hr[^>]*>~i', function($matches) {
522:             return "\n" . str_repeat('-', 50) . "\n";
523:         }, $text);
524:         // <th>
525:         $text = preg_replace_callback('~<th[^>]*>(.+?)</th>~is', function($matches) {
526:             return mb_strtoupper($matches[1], 'utf-8') . "\t";
527:         }, $text);
528:         // <a>
529:         $text = self::linkToText($text);
530:         // <ul> and <ol>
531:         $text = self::listToText($text);
532: 
533:         // Two empty lines at most
534:         $text = trim($text, "\n ");
535:         $text = preg_replace("~\n\s+\n~", "\n\n", $text);
536: 
537:         // Process <blockquote> (empty lines are removed before <blockquote> processing on purpose)
538:         $text = self::blockquoteToText($text);
539: 
540:         // Remove all left tags
541:         $text = strip_tags($text);
542: 
543:         // Replacing [textlink] for <> (must be done after strip_tags)
544:         $text = preg_replace('~\[textlink\]\s*~s', '<', $text);
545:         $text = preg_replace('~\s*\[/textlink\]~s', '>', $text);
546: 
547:         // Replaces non-breaking spaces
548:         $text = preg_replace(array('~&nbsp;&nbsp;&nbsp;&nbsp;~i', '~&nbsp;~i'), array("\t", ' '), $text);
549: 
550:         // Remove other entities (must not be performed before)
551:         // After previous processing some entities are upper case, that is why we have to use strtolower
552:         $text = preg_replace_callback('~(&#?[a-z0-9]+;)~i', function($matches) {
553:             return html_entity_decode(strtolower($matches[1]), ENT_QUOTES, 'utf-8');
554:         }, $text);
555: 
556:         // Two empty lines at most (performed second times on purpose)
557:         $text = trim($text, "\n ");
558:         $text = preg_replace("~\n\s+\n~", "\n\n", $text);
559:         // Because of <blockquote> converting
560:         $text = preg_replace("~(\n>\s*)+\n~", "\n>\n", $text);
561: 
562:         // One space at most
563:         $text = preg_replace("~(\n|\t)( )+~", '\1', $text);
564:         $text = preg_replace('~( ){2,}~', ' ', $text);
565: 
566:         // No space at line ends
567:         $text = preg_replace("~[ \t]+\n~", "\n", $text);
568: 
569:         return $text;
570:     }
571: 
572:     /**
573:      * Converts HTML links into plaintext.
574:      *
575:      * @param string $text Text with HTML fragments
576:      * @return string
577:      */
578:     private static function linkToText($text)
579:     {
580:         return preg_replace_callback('~<a\s+(?:[^>]+\s+)*href\s*=\s*"([^"]+)"(?:\s+[^>]*)?>(.+?)</a>~is', function($matches) {
581:             $url = trim($matches[1]);
582:             $content = $matches[2];
583:             $clearContent = trim(strip_tags($content));
584: 
585:             // Some urls have no real meaning
586:             if ((empty($url)) || ('#' === $url[0]) || ('/?' === substr($url, 0, 2))) {
587:                 return $content;
588:             }
589: 
590:             // Invalid url gets ignored
591:             if (!Jyxo_Input_Validator_IsUrl::validate($url)) {
592:                 return $content;
593:             }
594: 
595:             // If the link text and target are the same, use only one of them
596:             if ($url === $clearContent) {
597:                 return '[textlink]' . $content . '[/textlink]';
598:             } else {
599:                 return $content . ' [textlink]' . $url . '[/textlink]';
600:             }
601:         }, $text);
602:     }
603: 
604:     /**
605:      * Converts HTML lists to plaintext.
606:      *
607:      * @param string $text Text with HTML fragments
608:      * @return string
609:      */
610:     private static function listToText($text)
611:     {
612:         static $symbols = array('#', '*', 'o', '+');
613: 
614:         preg_match_all('~(?:<[a-z][a-z0-9]*[^>]*(?: /)?>)|(?:</[a-z][a-z0-9]*>)|(?:<![^>]+>)|(?:[^<]+)~i', $text, $matches);
615:         $text = '';
616:         $ulLevel = 0;
617:         $olLevel = 0;
618:         $olLiCount = array();
619:         $path = array();
620: 
621:         foreach ($matches[0] as $textPart) {
622:             if (0 === stripos($textPart, '<ol')) {
623:                 array_push($path, 'ol');
624:                 $olLevel++;
625:                 $olLiCount[$olLevel] = 1;
626:                 $textPart = "\n\n";
627:             } elseif ('</ol>' === strtolower($textPart)) {
628:                 array_pop($path);
629:                 $olLevel--;
630:                 $textPart = "\n\n";
631:             } elseif (0 === stripos($textPart, '<ul')) {
632:                 array_push($path, 'ul');
633:                 $ulLevel++;
634:                 $textPart = "\n\n";
635:             } elseif ('</ul>' === strtolower($textPart)) {
636:                 array_pop($path);
637:                 $ulLevel--;
638:                 $textPart = "\n\n";
639:             } elseif (0 === stripos($textPart, '<li')) {
640:                 $textPart = str_repeat("\t", $olLevel + $ulLevel);
641:                 if ('ul' === end($path)) {
642:                     $textPart .= $symbols[$ulLevel % 4] . ' ';
643:                 } elseif ('ol' === end($path)) {
644:                     $textPart .= $olLiCount[$olLevel] . '. ';
645:                     $olLiCount[$olLevel]++;
646:                 }
647:             } elseif ('</li>' === strtolower($textPart)) {
648:                 $textPart = "\n";
649:             }
650: 
651:             $text .= $textPart;
652:         }
653: 
654:         return $text;
655:     }
656: 
657:     /**
658:      * Converts citations into plaintext.
659:      *
660:      * @param string $text Text with HTML fragments
661:      * @return string
662:      */
663:     private static function blockquoteToText($text)
664:     {
665:         if (preg_match_all('~(?:<blockquote[^>]*>\s*)|(?:\s*</blockquote>)|(?:.+?(?=</?blockquote)|(?:.+))~is', $text, $matches) > 0) {
666:             $text = '';
667:             $offset = 0;
668:             foreach ($matches[0] as $textPart) {
669:                 if (($currentOffset = substr_count(strtolower($textPart), '<blockquote')) > 0) {
670:                     $offset += $currentOffset;
671:                     $textPart = ($offset == 1 ? "\n" : ''); // Adds a line to the beginning
672:                 } elseif (($currentOffset = substr_count(strtolower($textPart), '</blockquote>')) > 0) {
673:                     $offset -= $currentOffset;
674:                     $textPart = '';
675:                 } elseif ($offset > 0) {
676:                     $textPart = "\n" . str_repeat('>', $offset) . ' '   // Opening tag
677:                         . str_replace("\n", "\n" . str_repeat('>', $offset) . ' ', trim($textPart)) // Beginning of all lines
678:                         . "\n" . str_repeat('>', $offset);  // Closing tag
679:                 }
680: 
681:                 $text .= $textPart;
682:             }
683:         }
684: 
685:         return $text;
686:     }
687: }
688:
Packages

Classes