File Html.php | Jyxo PHP Library

  1: <?php
  2: 
  3: /**
  4:  * Jyxo PHP Library
  5:  *
  6:  * LICENSE
  7:  *
  8:  * This source file is subject to the new BSD license that is bundled
  9:  * with this package in the file license.txt.
 10:  * It is also available through the world-wide-web at this URL:
 11:  * https://github.com/jyxo/php/blob/master/license.txt
 12:  */
 13: 
 14: namespace Jyxo;
 15: 
 16: /**
 17:  * Functions for HTML processing.
 18:  *
 19:  * @category Jyxo
 20:  * @package Jyxo\Html
 21:  * @copyright Copyright (c) 2005-2011 Jyxo, s.r.o.
 22:  * @license https://github.com/jyxo/php/blob/master/license.txt
 23:  * @author Jaroslav Hanslík
 24:  */
 25: class Html
 26: {
 27:     /**
 28:      * Constructor preventing from creating instances of a static class.
 29:      *
 30:      * @throws \LogicException If trying to create an instance
 31:      */
 32:     public final function __construct()
 33:     {
 34:         throw new \LogicException(sprintf('Cannot create an instance of a static class %s.', get_class($this)));
 35:     }
 36: 
 37:     /**
 38:      * Tests if the given text contains at least one HTML tag.
 39:      * It is just an estimation.
 40:      *
 41:      * @param string $text Input text to be tested
 42:      * @return boolean
 43:      */
 44:     public static function is($text)
 45:     {
 46:         return (bool) preg_match('~<[a-z][a-z0-9]*(\s[^<]*)?>~i', $text);
 47:     }
 48: 
 49:     /**
 50:      * Fixes an invalid HTML source, unifies quotes and removes unnecessary whitespace.
 51:      * Required the Tidy PHP extension.
 52:      *
 53:      * @param string $html Input HTML source
 54:      * @return string
 55:      */
 56:     public static function repair($html)
 57:     {
 58:         // HTML fixing
 59:         static $config = array(
 60:             'newline' => 'LF',              // Uses LF line endings
 61:             'indent' => false,              // Removes indent
 62:             'output-xhtml' => true,         // Output will be in XHTML format
 63:             'output-bom' => false,          // No BOM
 64:             'doctype' => 'auto',            // Automatic doctype
 65:             // 'clean' => true,             // Removes presentation tags (inline styles would be moved into <style> elements)
 66:             'bare' => true,                 // Cleans MS HTML mess
 67:             'wrap' => 0,                    // No wrapping
 68:             'wrap-sections' => false,       // No <![ ... ]> wrapping
 69:             // 'quote-marks' => true,       // Replaces quotes with appropriate entities (causes problems with later regular expression processing)
 70:             // 'logical-emphasis' => true,  // Replaces all <i> and <b> tags with <em> and <strong> (styles cannot be parsed after)
 71:             'enclose-text' => true,         // Text inside <body> encapsulates with a <p> tag
 72:             'merge-divs' => false,          // Disables <div> merging
 73:             'merge-spans' => false,         // Disables <span> merging
 74:             // 'hide-comments' => true,     // Removes comments (it would remove conditional comments used when inserting Flash)
 75:             'force-output' => true,         // Makes output even on error
 76:             'show-errors' => 0,             // Don't show any errors
 77:             'show-warnings' => false,       // Don't show any warnings
 78:             'escape-cdata' => true,         // Makes an ordinary text from CDATA blocks
 79:             'preserve-entities' => true     // Preserves correctly formatted entities
 80:             // 'drop-proprietary-attributes' => true,   // Removes proprietary attributes (it would remove e.g. the background attribute)
 81:             // 'drop-font-tags' => true     // Removes <FONT> and <CENTER> tags
 82:         );
 83:         $html = tidy_repair_string($html, $config, 'utf8');
 84: 
 85:         // Removes namespace <?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" /? > generated by MS Word
 86:         $html = preg_replace('~<\?xml:namespace[^>]*>~i', '', $html);
 87: 
 88:         // Removes unnecessary line breaks and keeps them inside <pre> elements
 89:         // Tidy adds one more line breaks inside <pre> elements
 90:         $html = preg_replace("~(<pre[^>]*>)\n~", '\1', $html);
 91:         $html = preg_replace("~\n</pre>~", '</pre>', $html);
 92:         $html = preg_replace_callback('~(<pre[^>]*>)(.+?)(</pre>)~s', function($matches) {
 93:             return $matches[1] . strtr(nl2br($matches[2]), array('\"' => '"')) . $matches[3];
 94:         }, $html);
 95:         // Strip line breaks
 96:         $html = strtr($html, array("\r" => '', "\n" => ''));
 97: 
 98:         // Replace single quotes with double quotes (for easier processing later)
 99:         $html = preg_replace('~(<[a-z][a-z0-9]*[^>]+[a-z]+=)\'([^\']*)\'~i', '\1"\2"', $html);
100: 
101:         // Remove unnecessary spaces inside elements (for easier processing later)
102:         $html = preg_replace('~(<[a-z][a-z0-9]*[^>]+[a-z]+=")\s+([^"]*")~i', '\1\2', $html);
103:         $html = preg_replace('~(<[a-z][a-z0-9]*[^>]+[a-z]+="[^"]*)\s+(")~i', '\1\2', $html);
104: 
105:         return $html;
106:     }
107: 
108:     /**
109:      * Removes given tags from the HTML source.
110:      * If no tags are given, the default set is used.
111:      * Expects valid HTML code.
112:      *
113:      * @param string $html HTML source code
114:      * @param array $tags Tags to be removed
115:      * @return string
116:      */
117:     public static function removeTags($html, array $tags = array())
118:     {
119:         // Default set of tags
120:         static $default = array(
121:             'frameset', 'frame', 'noframes', 'iframe', 'script', 'noscript', 'style', 'link',
122:             'object', 'embed', 'form', 'input', 'select', 'textarea', 'button'
123:         );
124: 
125:         // If no tags are set, the default set will be used
126:         if (empty($tags)) {
127:             $tags = $default;
128:         }
129: 
130:         // Remove given tags
131:         foreach ($tags as $tag) {
132:             switch ($tag) {
133:                 // Embed
134:                 case 'embed':
135:                     // Second variant is because of Tidy that processes <embed> this way
136:                     $pattern = array('~\s*<embed[^>]*>.*?</embed>~is', '~\s*<embed[^>]*>~is');
137:                     break;
138:                 // Self closing tags
139:                 case 'link':
140:                 case 'meta':
141:                 case 'br':
142:                 case 'hr':
143:                 case 'img':
144:                 case 'input':
145:                     $pattern = array('~\s*<' . $tag . '[^>]*>~is');
146:                     break;
147:                 // Pair tags
148:                 default:
149:                     $pattern = array('~\s*<' . $tag . '(?:\s+[^>]*)?>.*?</' . $tag . '>~is');
150:                     break;
151:             }
152: 
153:             $html = preg_replace($pattern, '', $html);
154:         }
155: 
156:         return $html;
157:     }
158: 
159:     /**
160:      * Removes tags of the same type nested into each other from the HTML source.
161:      * Expects valid HTML source
162:      *
163:      * @param string $html HTML source code
164:      * @param string $tag Tags to be processed
165:      * @return string
166:      */
167:     public static function removeInnerTags($html, $tag)
168:     {
169:         $tag = (string) $tag;
170: 
171:         if (preg_match_all('~(?:<' . $tag . '>)|(?:</' . $tag . '>)|(?:<[^>]+>)|(?:[^<]+)~i', $html, $matches)) {
172:             $html = '';
173:             $level = 0;
174:             foreach ($matches[0] as $htmlPart) {
175:                 if (0 === stripos($htmlPart, '<' . $tag)) {
176:                     $level++;
177:                     if (1 === $level) {
178:                         $html .= $htmlPart;
179:                     }
180:                 } elseif (0 === stripos($htmlPart, '</' . $tag)) {
181:                     if (1 === $level) {
182:                         $html .= $htmlPart;
183:                     }
184:                     $level--;
185:                 } else {
186:                     $html .= $htmlPart;
187:                 }
188:             }
189:         }
190: 
191:         return $html;
192:     }
193: 
194:     /**
195:      * Removes given attributes from the HTML source.
196:      * If no attributes are given, the default set will be used.
197:      * Expects valid HTML source.
198:      *
199:      * @param string $html HTML source code
200:      * @param array $attributes Attributes to be removed
201:      * @return string
202:      */
203:     public static function removeAttributes($html, array $attributes = array())
204:     {
205:         // Default set of attributes
206:         static $default = array('id', 'class');
207: 
208:         // If no attributes are given, the default set will be used
209:         if (empty($attributes)) {
210:             $attributes = $default;
211:         }
212: 
213:         // Remove given attributes
214:         foreach ($attributes as $attribute) {
215:             $html = preg_replace('~(<[a-z][a-z0-9]*[^>]*?)\s+' . $attribute . '="[^"]*"~is', '\1', $html);
216:         }
217: 
218:         return $html;
219:     }
220: 
221:     /**
222:      * Removes all javascript events from the HTML source.
223:      * If it is necessary to remove only certain events, the removeAttributes() method can be used.
224:      * Expects valid HTML source.
225:      *
226:      * @param string $html HTML source code
227:      * @return string
228:      */
229:     public static function removeJavascriptEvents($html)
230:     {
231:         // A tag can have multiple events, therefore it is necessary to process the source multiple times
232:         while (preg_match('~<[a-z][a-z0-9]*[^>]*?\s+on[a-z]+="[^"]*"~is', $html)) {
233:             $html = preg_replace('~(<[a-z][a-z0-9]*[^>]*?)\s+on[a-z]+="[^"]*"~is', '\1', $html);
234:         }
235:         return $html;
236:     }
237: 
238:     /**
239:      * Removes foreign images from the HTML source.
240:      * Keeps <img> tags (only set the value about:blank into its src attribute), because removing the tag entirely could affect
241:      * the page layout.
242:      * Expects valid HTML source.
243:      *
244:      * @param string $html HTML source code
245:      * @return string
246:      */
247:     public static function removeRemoteImages($html)
248:     {
249:         static $remoteImages = array(
250:             '~(<img[^>]+src=")http(?:s)?://[^"]+(")~i',
251:             '~(<[a-z][a-z0-9]*[^>]+background=")http(?:s)?://[^"]+(")~i',
252:             '~(<[a-z][a-z0-9]*[^>]+style="[^"]*background\s*[:])([-a-z0-9#%\s]*)url\([^)]+\)(;)?~is',
253:             '~(<[a-z][a-z0-9]*[^>]+style="[^"]*)background-image\s*[:]([-a-z0-9#%\s]*)url\([^)]+\)(;)?~is',
254:             '~(<[a-z][a-z0-9]*[^>]+style="[^"]*list-style\s*[:])([-a-z0-9\s]*)url\([^)]+\)(;)?~is',
255:             '~(<[a-z][a-z0-9]*[^>]+style="[^"]*)list-style-image\s*[:]([-a-z0-9\s]*)url\([^)]+\)(;)?~is'
256:         );
257:         // We use value about:blank for the <img> tag's src attribute, because removing the tag entirely could affect the page layout
258:         static $remoteImagesReplacement = array(
259:             '\1about:blank\2',
260:             '\1\2',
261:             '\1\2\3',
262:             '\1',
263:             '\1\2\3',
264:             '\1'
265:         );
266: 
267:         return preg_replace($remoteImages, $remoteImagesReplacement, $html);
268:     }
269: 
270:     /**
271:      * Removes possibly dangerous attributes that could contain XSS code from the HTML source.
272:      *
273:      * @param string $html HTML source code
274:      * @return string
275:      */
276:     public static function removeDangerous($html)
277:     {
278:         static $dangerous = array(
279:             '~\s+href="javascript[^"]*"~i',
280:             '~\s+src="javascript[^"]*"~i',
281:             '~\s+href="data:[^"]*"~i',  // See http://www.soom.cz/index.php?name=projects/testmail/main
282:             '~\s+src="data:[^"]*"~i'
283:         );
284: 
285:         return preg_replace($dangerous, '', $html);
286:     }
287: 
288:     /**
289:      * Returns <body> contents from the given HTML source.
290:      * Expects valid HTML source.
291:      *
292:      * @param string $html HTML source code
293:      * @return string
294:      */
295:     public static function getBody($html)
296:     {
297:         // If the source code contains <body>, return this element's contents
298:         if (preg_match('~<body([^>]*)>(.*?)</body>~is', $html, $matches)) {
299:             $body = trim($matches[2]);
300: 
301:             // Converts <body> inline styles to a newly created <div> element
302:             if (preg_match('~style="[^"]+"~i', $matches[1], $style)) {
303:                 $body = '<div ' . $style[0] . '>' . $body . '</div>';
304:             }
305: 
306:             return $body;
307:         }
308: 
309:         // Return everything otherwise
310:         return $html;
311:     }
312: 
313:     /**
314:      * Converts text to HTML source code.
315:      *
316:      * @param string $text Input text
317:      * @param boolean $convertLinks Convert urls and emails to links
318:      * @return string
319:      */
320:     public static function fromText($text, $convertLinks = true)
321:     {
322:         // Trimming whitespace (except spaces)
323:         $text = trim($text, "\r\n");
324: 
325:         // Two empty lines max
326:         $text = preg_replace("~\n\s+\n~", "\n\n", $text);
327: 
328:         // Special chars
329:         $html = htmlspecialchars($text, ENT_QUOTES, 'utf-8', false);
330: 
331:         // Two spaces mean an indent, convert to non-breaking spaces
332:         $html = str_replace('  ', '&nbsp;&nbsp;', $html);
333:         // Convert tabs to four non-breaking spaces
334:         $html = str_replace("\t", '&nbsp;&nbsp;&nbsp;&nbsp;', $html);
335: 
336:         // Paragraph
337:         $html = '<p>' . preg_replace("~\n\n[^\\n]?~", '</p><p>\0', $html) . '</p>';
338:         $html = str_replace("\n", "<br />\n", $html);
339:         $html = str_ireplace('<p><br />', "<p>\n", $html);
340: 
341:         // Citation
342:         preg_match_all('~(?:(^(?:<p>)?\s*&gt;(?:&gt;|\s)*)(.*)$)|(?:.+)~im', $html, $matches);
343:         $html = '';
344:         $offset = 0;
345:         for ($i = 0; $i < count($matches[0]); $i++) {
346:             $currentOffset = substr_count($matches[1][$i], '&gt;');
347:             if ($currentOffset > 0) {
348:                 if ($currentOffset > $offset) {
349:                     $html .= str_repeat('<blockquote type="cite">', $currentOffset - $offset) . '<p>';
350:                     $offset = $currentOffset;
351:                 } elseif ($currentOffset < $offset) {
352:                     $html .= '</p>' . str_repeat('</blockquote>', $offset - $currentOffset) . '<p>';
353:                     $offset = $currentOffset;
354:                 }
355: 
356:                 $html .= $matches[2][$i];
357:             } else {
358:                 if ($offset > 0) {
359:                     $html .= '</p>' . str_repeat('</blockquote>', $offset) . '<p>';
360:                     $offset = 0;
361:                 }
362: 
363:                 $html .= $matches[0][$i];
364:             }
365:         }
366:         if ($offset > 0) {
367:             $html .= '</p>' . str_repeat('</blockquote>', $offset);
368:         }
369: 
370:         // Removes empty lines that were created during previous processing
371:         $html = preg_replace('~(?:<br />)+</p></blockquote>~i', '</p></blockquote>', $html);
372:         $html = str_ireplace('<p><br /></p>', '', $html);
373:         $html = str_ireplace('<p><br />', '<p>', $html);
374: 
375:         // Emails and urls
376:         if ($convertLinks) {
377:             $html = self::linkFromText($html);
378:         }
379: 
380:         return $html;
381:     }
382: 
383:     /**
384:      * Converts text to a link to an url or email.
385:      *
386:      * @param string $text Input text
387:      * @return string
388:      */
389:     public static function linkFromText($text)
390:     {
391:         $patternGenericTld = '(?:tld|aero|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|asia|post|geo)';
392:         $patternTld = '(?-i:' . $patternGenericTld . '|[a-z]{2})';
393:         $patternDomain = '(?:(?:[a-z]|[a-z0-9](?:[-a-z0-9]{0,61}[a-z0-9]))[.])*(?:[a-z0-9](?:[-a-z0-9]{0,61}[a-z0-9])[.]' . $patternTld . ')';
394: 
395:         $pattern8bit = '(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]?[0-9])';
396:         $patternIPv4 = '(?:' . $pattern8bit . '(?:[.]' . $pattern8bit . '){3})';
397: 
398:         // a:b:c:d:e:f:g:h
399:         $patternIpV6Variant8Hex = '(?:(?:[0-9a-f]{1,4}:){7}[0-9a-f]{1,4})';
400:         // Compressed a::b
401:         $patternIpV6VariantCompressedHex = '(?:(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?)::(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?))';
402:         // IPv4 mapped to  IPv6 a:b:c:d:e:f:w.x.y.z
403:         $patternIpV6VariantHex4Dec = '(?:(?:(?:[0-9a-f]{1,4}:){6})' . $patternIPv4 . ')';
404:         // Compressed IPv4 mapped to IPv6 a::b:w.x.y.z
405:         $patternIpV6VariantCompressedHex4Dec = '(?:(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?)::(?:(?:[0-9a-f]{1,4}:)*)' . $patternIPv4 . ')';
406:         $patternIpV6 = '(?:' . $patternIpV6Variant8Hex . '|' . $patternIpV6VariantCompressedHex . '|' . $patternIpV6VariantHex4Dec . '|' . $patternIpV6VariantCompressedHex4Dec . ')';
407: 
408:         // mailto:username
409:         $patternEmail = '(?:mailto:)?(?:[-\w!#\$%&\'*+/=?^`{|}\~]+(?:[.][-\w!#\$%&\'*+/=?^`{|}\~]+)*)';
410:         // @domain.tld
411:         $patternEmail .= '(?:@' . $patternDomain . ')';
412: 
413:         // protocol://user:password@
414:         $patternUrl = '(?:(?:http|ftp)s?://(?:[\S]+(?:[:][\S]*)?@)?)?';
415:         // domain.tld, IPv4 or IPv6
416:         $patternUrl .= '(?:' . $patternDomain . '|' . $patternIPv4 . '|' . $patternIpV6 . ')';
417:         // :port/path/file.extension
418:         $patternUrl .= '(?::[0-9]+)?(?:(?:/[-\w\pL\pN\~.:!%]+)*(?:/|[.][a-z0-9]{2,4})?)?';
419:         // ?query#hash
420:         $patternUrl .= '(?:[?][\]\[-\w\pL\pN.,?!\~%#@&;:/\'\=+]*)?(?:#[\]\[-\w\pL\pN.,?!\~%@&;:/\'\=+]*)?';
421: 
422:         return preg_replace_callback('~(^|[^\pL\pN])(?:(' . $patternEmail . ')|(' . $patternUrl . '))(?=$|\W)~iu', function($matches) {
423:             // Url
424:             if (isset($matches[3])) {
425:                 $url = $matches[3];
426:                 // Remove special chars at the end
427:                 if (preg_match('~(([.,:;?!>)\]}]|(&gt;))+)$~i', $url, $matches2)) {
428:                     $punctuation = $matches2[1];
429:                     // strlen is necessary because of &gt;
430:                     $url = mb_substr($url, 0, -strlen($matches2[1]), 'utf-8');
431:                 } else {
432:                     $punctuation = '';
433:                 }
434: 
435:                 // Add missing http://
436:                 $linkUrl = !preg_match('~^(http|ftp)s?://~i', $url) ? 'http://' .  $url : $url;
437: 
438:                 // Create a link
439:                 return $matches[1] . '<a href="' . $linkUrl . '">' . $url . '</a>' . $punctuation;
440:             }
441: 
442:             // Emails
443:             if (isset($matches[2])) {
444:                 $email = $matches[2];
445:                 if (false !== stripos($email, 'mailto:')) {
446:                     $email = substr($matches[2], 7);
447:                     $protocol = 'mailto:';
448:                 } else {
449:                     $protocol = '';
450:                 }
451:                 return $matches[1] . '<a href="mailto:' . $email . '">' . $protocol . $email . '</a>';
452:             }
453:         }, $text);
454:     }
455: 
456:     /**
457:      * Converts HTML source code to plaintext.
458:      *
459:      * @param string $html HTML source code
460:      * @return string
461:      */
462:     public static function toText($html)
463:     {
464:         $text = $html;
465: 
466:         // Re-format lines
467:         // <pre>
468:         $text = preg_replace_callback('~<pre[^>]*>(.+?)</pre>~is', function($matches) {
469:             // Line breaks are converted to <br />, that are removed later
470:             return nl2br($matches[1]);
471:         }, $text);
472:         // \r, redundant line breaks, tabs and <br />
473:         $text = preg_replace(
474:             array("~\r~", "~[\n\t]+~", '~<br[^>]*>~i'),
475:             array('', ' ', "\n"),
476:             $text
477:         );
478: 
479:         // Processing of most tags and entities
480:         static $search = array(
481:             '~<h[3-6][^>]*>(.+?)</h[3-6]>~is',  // <h3> to <h6>
482:             '~(<div[^>]*>)|(</div>)~i',         // <div> and </div>
483:             '~(<p(?:\s+[^>]+)?>)|(</p>)~i',     // <p> and </p>
484:             '~(<table[^>]*>)|(</table>)~i',     // <table> and </table>
485:             '~</tr>*~i',                        // </tr>
486:             '~<td[^>]*>(.+?)</td>~is',          // <td> and </td>
487:             // '~(<code[^>]*>)|(</code>)~i',    // <code> and </code>
488:             '~(&hellip;)~i',                    // Ellipsis
489:             '~(&#8220;)|(&#8221;)~i',           // Quotes
490:             '~(&apos;)~i',                      // Apostrophe
491:             '~(&copy;)|(&#169;)~i',             // Copyright
492:             '~&trade;~i',                       // Trademark
493:             '~&reg;~i',                         // Registered trademark
494:             '~(&mdash;)|(&ndash;)~i'            // Dash and hyphen
495:         );
496:         static $replace = array(
497:             "\n\n\\1\n\n",  // <h3> to <h6>
498:             "\n\n",         // <div> and </div>
499:             "\n\n",         // <p> and </p>
500:             "\n\n",         // <table> and </table>
501:             "\n",           // </tr>
502:             "\\1\t",        // <td> and </td>
503:             // "\n\n",      // <code> and </code>
504:             '...',          // Ellipsis
505:             '"',            // Quotes
506:             '\'',           // Apostrophe
507:             '(c)',          // Copyright
508:             '(tm)',         // Trademark
509:             '(R)',          // Registered trademark
510:             '-'             // Dash and hyphen
511:         );
512:         $text = preg_replace($search, $replace, $text);
513: 
514:         // <h1> and <h2>
515:         $text = preg_replace_callback('~<h[12][^>]*>(.+?)</h[12]>~is', function($matches) {
516:             return "\n\n\n" . mb_strtoupper($matches[1], 'utf-8') . "\n\n";
517:         }, $text);
518:         // <strong>
519:         $text = preg_replace_callback('~<strong[^>]*>(.+?)</strong>~is', function($matches) {
520:             return mb_strtoupper($matches[1], 'utf-8');
521:         }, $text);
522:         // <hr />
523:         $text = preg_replace_callback('~<hr[^>]*>~i', function($matches) {
524:             return "\n" . str_repeat('-', 50) . "\n";
525:         }, $text);
526:         // <th>
527:         $text = preg_replace_callback('~<th[^>]*>(.+?)</th>~is', function($matches) {
528:             return mb_strtoupper($matches[1], 'utf-8') . "\t";
529:         }, $text);
530:         // <a>
531:         $text = self::linkToText($text);
532:         // <ul> and <ol>
533:         $text = self::listToText($text);
534: 
535:         // Two empty lines at most
536:         $text = trim($text, "\n ");
537:         $text = preg_replace("~\n\s+\n~", "\n\n", $text);
538: 
539:         // Process <blockquote> (empty lines are removed before <blockquote> processing on purpose)
540:         $text = self::blockquoteToText($text);
541: 
542:         // Remove all left tags
543:         $text = strip_tags($text);
544: 
545:         // Replacing [textlink] for <> (must be done after strip_tags)
546:         $text = preg_replace('~\[textlink\]\s*~s', '<', $text);
547:         $text = preg_replace('~\s*\[/textlink\]~s', '>', $text);
548: 
549:         // Replaces non-breaking spaces
550:         $text = preg_replace(array('~&nbsp;&nbsp;&nbsp;&nbsp;~i', '~&nbsp;~i'), array("\t", ' '), $text);
551: 
552:         // Remove other entities (must not be performed before)
553:         // After previous processing some entities are upper case, that is why we have to use strtolower
554:         $text = preg_replace_callback('~(&#?[a-z0-9]+;)~i', function($matches) {
555:             return html_entity_decode(strtolower($matches[1]), ENT_QUOTES, 'utf-8');
556:         }, $text);
557: 
558:         // Two empty lines at most (performed second times on purpose)
559:         $text = trim($text, "\n ");
560:         $text = preg_replace("~\n\s+\n~", "\n\n", $text);
561:         // Because of <blockquote> converting
562:         $text = preg_replace("~(\n>\s*)+\n~", "\n>\n", $text);
563: 
564:         // One space at most
565:         $text = preg_replace("~(\n|\t)( )+~", '\1', $text);
566:         $text = preg_replace('~( ){2,}~', ' ', $text);
567: 
568:         // No space at line ends
569:         $text = preg_replace("~[ \t]+\n~", "\n", $text);
570: 
571:         return $text;
572:     }
573: 
574:     /**
575:      * Converts HTML links into plaintext.
576:      *
577:      * @param string $text Text with HTML fragments
578:      * @return string
579:      */
580:     private static function linkToText($text)
581:     {
582:         return preg_replace_callback('~<a\s+(?:[^>]+\s+)*href\s*=\s*"([^"]+)"(?:\s+[^>]*)?>(.+?)</a>~is', function($matches) {
583:             $url = trim($matches[1]);
584:             $content = $matches[2];
585:             $clearContent = trim(strip_tags($content));
586: 
587:             // Some urls have no real meaning
588:             if ((empty($url)) || ('#' === $url[0]) || ('/?' === substr($url, 0, 2))) {
589:                 return $content;
590:             }
591: 
592:             // Invalid url gets ignored
593:             if (!Input\Validator\IsUrl::validate($url)) {
594:                 return $content;
595:             }
596: 
597:             // If the link text and target are the same, use only one of them
598:             if ($url === $clearContent) {
599:                 return '[textlink]' . $content . '[/textlink]';
600:             } else {
601:                 return $content . ' [textlink]' . $url . '[/textlink]';
602:             }
603:         }, $text);
604:     }
605: 
606:     /**
607:      * Converts HTML lists to plaintext.
608:      *
609:      * @param string $text Text with HTML fragments
610:      * @return string
611:      */
612:     private static function listToText($text)
613:     {
614:         static $symbols = array('#', '*', 'o', '+');
615: 
616:         preg_match_all('~(?:<[a-z][a-z0-9]*[^>]*(?: /)?>)|(?:</[a-z][a-z0-9]*>)|(?:<![^>]+>)|(?:[^<]+)~i', $text, $matches);
617:         $text = '';
618:         $ulLevel = 0;
619:         $olLevel = 0;
620:         $olLiCount = array();
621:         $path = array();
622: 
623:         foreach ($matches[0] as $textPart) {
624:             if (0 === stripos($textPart, '<ol')) {
625:                 array_push($path, 'ol');
626:                 $olLevel++;
627:                 $olLiCount[$olLevel] = 1;
628:                 $textPart = "\n\n";
629:             } elseif ('</ol>' === strtolower($textPart)) {
630:                 array_pop($path);
631:                 $olLevel--;
632:                 $textPart = "\n\n";
633:             } elseif (0 === stripos($textPart, '<ul')) {
634:                 array_push($path, 'ul');
635:                 $ulLevel++;
636:                 $textPart = "\n\n";
637:             } elseif ('</ul>' === strtolower($textPart)) {
638:                 array_pop($path);
639:                 $ulLevel--;
640:                 $textPart = "\n\n";
641:             } elseif (0 === stripos($textPart, '<li')) {
642:                 $textPart = str_repeat("\t", $olLevel + $ulLevel);
643:                 if ('ul' === end($path)) {
644:                     $textPart .= $symbols[$ulLevel % 4] . ' ';
645:                 } elseif ('ol' === end($path)) {
646:                     $textPart .= $olLiCount[$olLevel] . '. ';
647:                     $olLiCount[$olLevel]++;
648:                 }
649:             } elseif ('</li>' === strtolower($textPart)) {
650:                 $textPart = "\n";
651:             }
652: 
653:             $text .= $textPart;
654:         }
655: 
656:         return $text;
657:     }
658: 
659:     /**
660:      * Converts citations into plaintext.
661:      *
662:      * @param string $text Text with HTML fragments
663:      * @return string
664:      */
665:     private static function blockquoteToText($text)
666:     {
667:         if (preg_match_all('~(?:<blockquote[^>]*>\s*)|(?:\s*</blockquote>)|(?:.+?(?=</?blockquote)|(?:.+))~is', $text, $matches) > 0) {
668:             $text = '';
669:             $offset = 0;
670:             foreach ($matches[0] as $textPart) {
671:                 if (($currentOffset = substr_count(strtolower($textPart), '<blockquote')) > 0) {
672:                     $offset += $currentOffset;
673:                     $textPart = ($offset == 1 ? "\n" : ''); // Adds a line to the beginning
674:                 } elseif (($currentOffset = substr_count(strtolower($textPart), '</blockquote>')) > 0) {
675:                     $offset -= $currentOffset;
676:                     $textPart = '';
677:                 } elseif ($offset > 0) {
678:                     $textPart = "\n" . str_repeat('>', $offset) . ' '   // Opening tag
679:                         . str_replace("\n", "\n" . str_repeat('>', $offset) . ' ', trim($textPart)) // Beginning of all lines
680:                         . "\n" . str_repeat('>', $offset);  // Closing tag
681:                 }
682: 
683:                 $text .= $textPart;
684:             }
685:         }
686: 
687:         return $text;
688:     }
689: }
690:
Namespaces

Classes

Exceptions