1: <?php
2:
3: 4: 5: 6: 7: 8: 9: 10: 11: 12:
13:
14: namespace Jyxo;
15:
16: 17: 18: 19: 20: 21: 22: 23: 24:
25: class Html
26: {
27: 28: 29: 30: 31:
/**
 * Blocks instantiation of this class.
 *
 * @throws \LogicException Always; the message names the class being instantiated.
 */
final public function __construct()
{
    $message = sprintf('Cannot create an instance of a static class %s.', get_class($this));

    throw new \LogicException($message);
}
36:
37: 38: 39: 40: 41: 42: 43:
/**
 * Tells whether the given text contains at least one opening HTML tag.
 *
 * A tag is a "<" immediately followed by a letter, optional letters/digits,
 * an optional whitespace-introduced attribute part and a closing ">".
 *
 * @param string $text Text to inspect
 * @return boolean True when such a tag was found
 */
public static function is($text)
{
    $found = preg_match('~<[a-z][a-z0-9]*(\s[^<]*)?>~i', $text);

    return 1 === $found;
}
48:
49: 50: 51: 52: 53: 54: 55:
/**
 * Repairs HTML with tidy and normalizes the result.
 *
 * After tidy has run, the output is post-processed: "<?xml:namespace ...>"
 * declarations are removed, newlines inside <pre> become <br /> tags, all
 * remaining line breaks are dropped, single-quoted attribute values are
 * converted to double-quoted ones and whitespace at the edges of attribute
 * values is trimmed.
 *
 * @param string $html Input HTML
 * @return string Repaired HTML on a single line
 */
public static function repair($html)
{
    // Options handed to tidy_repair_string()
    static $config = array(
        'newline' => 'LF',
        'indent' => false,
        'output-xhtml' => true,
        'output-bom' => false,
        'doctype' => 'auto',

        'bare' => true,
        'wrap' => 0,
        'wrap-sections' => false,

        'enclose-text' => true,
        'merge-divs' => false,
        'merge-spans' => false,

        'force-output' => true,
        'show-errors' => 0,
        'show-warnings' => false,
        'escape-cdata' => true,
        'preserve-entities' => true
    );

    $repaired = tidy_repair_string($html, $config, 'utf8');

    // Drop <?xml:namespace ...> declarations
    $repaired = preg_replace('~<\?xml:namespace[^>]*>~i', '', $repaired);

    // Strip the newline directly after <pre> and directly before </pre>
    $repaired = preg_replace(
        array("~(<pre[^>]*>)\n~", "~\n</pre>~"),
        array('\1', '</pre>'),
        $repaired
    );

    // Line breaks inside <pre> are kept as <br /> because all newlines are removed below
    $repaired = preg_replace_callback('~(<pre[^>]*>)(.+?)(</pre>)~s', function($pre) {
        $content = str_replace('\"', '"', nl2br($pre[2]));

        return $pre[1] . $content . $pre[3];
    }, $repaired);

    // Remove every remaining line break
    $repaired = str_replace(array("\r", "\n"), '', $repaired);

    // Single-quoted attribute values become double-quoted
    $repaired = preg_replace('~(<[a-z][a-z0-9]*[^>]+[a-z]+=)\'([^\']*)\'~i', '\1"\2"', $repaired);

    // Trim whitespace at the beginning and at the end of attribute values
    $repaired = preg_replace(
        array(
            '~(<[a-z][a-z0-9]*[^>]+[a-z]+=")\s+([^"]*")~i',
            '~(<[a-z][a-z0-9]*[^>]+[a-z]+="[^"]*)\s+(")~i'
        ),
        '\1\2',
        $repaired
    );

    return $repaired;
}
107:
108: 109: 110: 111: 112: 113: 114: 115: 116:
/**
 * Removes the given tags from the HTML source.
 *
 * Paired tags are removed together with their content. Tags listed as
 * content-less (link, meta, br, hr, img, input) are removed as single tags;
 * "embed" is removed in both its paired and its single form. Whitespace
 * directly preceding a removed tag is removed as well.
 *
 * Tag names are matched case-insensitively and are escaped before being put
 * into the regular expression, so they are always treated literally.
 *
 * @param string $html HTML source
 * @param array $tags Tag names to remove; when empty a default list is used
 * @return string HTML without the given tags
 */
public static function removeTags($html, array $tags = array())
{
    // Used when the caller passes no tags
    static $default = array(
        'frameset', 'frame', 'noframes', 'iframe', 'script', 'noscript', 'style', 'link',
        'object', 'embed', 'form', 'input', 'select', 'textarea', 'button'
    );

    // Tags that have no closing counterpart
    static $single = array('link', 'meta', 'br', 'hr', 'img', 'input');

    if (empty($tags)) {
        $tags = $default;
    }

    foreach ($tags as $tag) {
        $tag = (string) $tag;
        if ('' === $tag) {
            continue;
        }

        // Classification ignores case ('IMG' is still a single tag); the patterns are case-insensitive anyway
        $kind = strtolower($tag);
        // Escaped so that a name containing regex metacharacters cannot change the pattern
        $name = preg_quote($tag, '~');

        if ('embed' === $kind) {
            // Paired form first, then whatever single tags are left
            $pattern = array('~\s*<embed[^>]*>.*?</embed>~is', '~\s*<embed[^>]*>~is');
        } elseif (in_array($kind, $single, true)) {
            $pattern = array('~\s*<' . $name . '[^>]*>~is');
        } else {
            $pattern = array('~\s*<' . $name . '(?:\s+[^>]*)?>.*?</' . $name . '>~is');
        }

        $html = preg_replace($pattern, '', $html);
    }

    return $html;
}
158:
159: 160: 161: 162: 163: 164: 165: 166:
/**
 * Removes nested occurrences of a tag, keeping only the outermost pair.
 *
 * The content of the nested tags is preserved, only the nested opening and
 * closing tags themselves are dropped. A closing tag without a matching
 * opening tag is dropped too and does not affect later pairs.
 *
 * @param string $html HTML source
 * @param string $tag Tag name
 * @return string HTML without nested occurrences of the tag
 */
public static function removeInnerTags($html, $tag)
{
    $tag = (string) $tag;
    if ('' === $tag) {
        // Nothing sensible can be matched with an empty tag name
        return $html;
    }

    // The name must be followed by whitespace, "/" or ">", so that e.g. "b" does not match <br> or <blockquote>
    $name = preg_quote($tag, '~');
    $opening = '~^<' . $name . '(?=[\s/>])~i';
    $closing = '~^</' . $name . '(?=[\s>])~i';

    // Split into tags and text chunks
    if (!preg_match_all('~<[^>]+>|[^<]+~', $html, $matches)) {
        return $html;
    }

    $result = '';
    $level = 0;
    foreach ($matches[0] as $htmlPart) {
        if (preg_match($opening, $htmlPart)) {
            $level++;
            if (1 === $level) {
                $result .= $htmlPart;
            }
        } elseif (preg_match($closing, $htmlPart)) {
            if (1 === $level) {
                $result .= $htmlPart;
            }
            // Never go below zero, otherwise a stray closing tag would swallow the next outer pair
            if ($level > 0) {
                $level--;
            }
        } else {
            $result .= $htmlPart;
        }
    }

    return $result;
}
193:
194: 195: 196: 197: 198: 199: 200: 201: 202:
/**
 * Removes the given double-quoted attributes from all tags.
 *
 * Attribute names are matched case-insensitively and literally. Every
 * occurrence is removed, including repeated occurrences within one tag.
 *
 * @param string $html HTML source
 * @param array $attributes Attribute names; when empty, "id" and "class" are removed
 * @return string HTML without the given attributes
 */
public static function removeAttributes($html, array $attributes = array())
{
    // Used when the caller passes no attributes
    static $default = array('id', 'class');

    if (empty($attributes)) {
        $attributes = $default;
    }

    foreach ($attributes as $attribute) {
        $pattern = '~(<[a-z][a-z0-9]*[^>]*?)\s+' . preg_quote((string) $attribute, '~') . '="[^"]*"~is';

        // A single pass removes at most one occurrence per tag, so repeat until nothing changes
        do {
            $html = preg_replace($pattern, '\1', $html, -1, $count);
        } while ($count > 0);
    }

    return $html;
}
220:
221: 222: 223: 224: 225: 226: 227: 228:
/**
 * Removes JavaScript event handler attributes (on*) from all tags.
 *
 * Handlers are removed regardless of how the value is written:
 * double-quoted, single-quoted or unquoted, with optional whitespace
 * around the equals sign.
 *
 * @param string $html HTML source
 * @return string HTML without event handler attributes
 */
public static function removeJavascriptEvents($html)
{
    $pattern = '~(<[a-z][a-z0-9]*[^>]*?)\s+on[a-z]+\s*=\s*(?:"[^"]*"|\'[^\']*\'|[^\s>"\']+)~is';

    // A single pass removes at most one handler per tag, so repeat until nothing changes
    do {
        $html = preg_replace($pattern, '\1', $html, -1, $count);
    } while ($count > 0);

    return $html;
}
237:
238: 239: 240: 241: 242: 243: 244: 245: 246:
/**
 * Removes references to remote images.
 *
 * - src of <img> pointing to http://, https:// or a protocol-relative
 *   "//host/..." address is replaced with "about:blank",
 * - a background attribute with such an address is emptied,
 * - url(...) is removed from "background" and "list-style" declarations
 *   inside style attributes,
 * - "background-image" and "list-style-image" declarations containing
 *   url(...) are removed from style attributes.
 *
 * @param string $html HTML source
 * @return string HTML without remote image references
 */
public static function removeRemoteImages($html)
{
    static $remoteImages = array(
        // "(?:https?:)?//" also covers protocol-relative addresses, which are remote as well
        '~(<img[^>]+src=")(?:https?:)?//[^"]+(")~i',
        '~(<[a-z][a-z0-9]*[^>]+background=")(?:https?:)?//[^"]+(")~i',
        '~(<[a-z][a-z0-9]*[^>]+style="[^"]*background\s*[:])([-a-z0-9#%\s]*)url\([^)]+\)(;)?~is',
        '~(<[a-z][a-z0-9]*[^>]+style="[^"]*)background-image\s*[:]([-a-z0-9#%\s]*)url\([^)]+\)(;)?~is',
        '~(<[a-z][a-z0-9]*[^>]+style="[^"]*list-style\s*[:])([-a-z0-9\s]*)url\([^)]+\)(;)?~is',
        '~(<[a-z][a-z0-9]*[^>]+style="[^"]*)list-style-image\s*[:]([-a-z0-9\s]*)url\([^)]+\)(;)?~is'
    );

    // Replacements, index-aligned with the patterns above
    static $remoteImagesReplacement = array(
        '\1about:blank\2',
        '\1\2',
        '\1\2\3',
        '\1',
        '\1\2\3',
        '\1'
    );

    return preg_replace($remoteImages, $remoteImagesReplacement, $html);
}
269:
270: 271: 272: 273: 274: 275:
/**
 * Removes href and src attributes whose double-quoted value starts with
 * "javascript" or "data:".
 *
 * Whitespace between the opening quote and the scheme is tolerated, so a
 * value such as " javascript:..." is removed as well.
 *
 * @param string $html HTML source
 * @return string HTML without the dangerous attributes
 */
public static function removeDangerous($html)
{
    static $dangerous = array(
        '~\s+href="\s*javascript[^"]*"~i',
        '~\s+src="\s*javascript[^"]*"~i',
        '~\s+href="\s*data:[^"]*"~i',
        '~\s+src="\s*data:[^"]*"~i'
    );

    return preg_replace($dangerous, '', $html);
}
287:
288: 289: 290: 291: 292: 293: 294:
/**
 * Returns the trimmed content of the <body> element.
 *
 * When the <body> tag carries a non-empty style attribute, the content is
 * wrapped in a <div> with that same style attribute. Input without a
 * <body>...</body> pair is returned unchanged.
 *
 * @param string $html HTML source
 * @return string Body content
 */
public static function getBody($html)
{
    if (!preg_match('~<body([^>]*)>(.*?)</body>~is', $html, $matches)) {
        // No body element found
        return $html;
    }

    list(, $attributes, $content) = $matches;
    $body = trim($content);

    // Keep the styling defined on <body>
    if (preg_match('~style="[^"]+"~i', $attributes, $style)) {
        return '<div ' . $style[0] . '>' . $body . '</div>';
    }

    return $body;
}
312:
313: 314: 315: 316: 317: 318: 319:
/**
 * Converts plain text to HTML.
 *
 * Special characters are escaped, double spaces and tabs are turned into
 * non-breaking spaces, blank lines separate paragraphs, single line breaks
 * become <br /> and lines starting with ">" are wrapped into nested
 * <blockquote type="cite"> elements according to the number of ">" marks.
 *
 * @param string $text Plain text
 * @param boolean $convertLinks Whether URLs and e-mail addresses should be turned into links
 * @return string HTML
 */
public static function fromText($text, $convertLinks = true)
{
    // Trim line breaks only, leading spaces may be intentional indentation
    $text = trim($text, "\r\n");

    // Lines containing only whitespace are treated as empty
    $text = preg_replace("~\n\s+\n~", "\n\n", $text);

    // Escape special characters; existing entities are not encoded twice
    $html = htmlspecialchars($text, ENT_QUOTES, 'utf-8', false);

    // Consecutive spaces and tabs would collapse when rendered, so they become non-breaking spaces
    $html = str_replace('  ', '&nbsp;&nbsp;', $html);
    $html = str_replace("\t", '&nbsp;&nbsp;&nbsp;&nbsp;', $html);

    // Paragraphs and line breaks
    $html = '<p>' . preg_replace("~\n\n[^\\n]?~", '</p><p>\0', $html) . '</p>';
    $html = str_replace("\n", "<br />\n", $html);
    $html = str_ireplace('<p><br />', "<p>\n", $html);

    // Citations: the text is already escaped at this point, so a quote mark appears as "&gt;"
    preg_match_all('~(?:(^(?:<p>)?\s*&gt;(?:&gt;|\s)*)(.*)$)|(?:.+)~im', $html, $matches);
    $html = '';
    $offset = 0;
    foreach ($matches[0] as $i => $line) {
        // Number of quote marks on the line equals the citation depth
        $currentOffset = substr_count($matches[1][$i], '&gt;');
        if ($currentOffset > 0) {
            if ($currentOffset > $offset) {
                $html .= str_repeat('<blockquote type="cite">', $currentOffset - $offset) . '<p>';
                $offset = $currentOffset;
            } elseif ($currentOffset < $offset) {
                $html .= '</p>' . str_repeat('</blockquote>', $offset - $currentOffset) . '<p>';
                $offset = $currentOffset;
            }

            // Only the text after the quote marks is kept
            $html .= $matches[2][$i];
        } else {
            if ($offset > 0) {
                $html .= '</p>' . str_repeat('</blockquote>', $offset) . '<p>';
                $offset = 0;
            }

            $html .= $line;
        }
    }
    if ($offset > 0) {
        $html .= '</p>' . str_repeat('</blockquote>', $offset);
    }

    // Remove line breaks left at paragraph edges
    $html = preg_replace('~(?:<br />)+</p></blockquote>~i', '</p></blockquote>', $html);
    $html = str_ireplace('<p><br /></p>', '', $html);
    $html = str_ireplace('<p><br />', '<p>', $html);

    if ($convertLinks) {
        $html = self::linkFromText($html);
    }

    return $html;
}
382:
383: 384: 385: 386: 387: 388:
/**
 * Turns URLs and e-mail addresses found in the text into links.
 *
 * URLs without a scheme get "http://" prepended in the href. Punctuation
 * at the end of a URL, including an escaped "&gt;", is kept outside of the
 * link. E-mail addresses are linked with a "mailto:" href.
 *
 * @param string $text Input text
 * @return string Text with links
 */
public static function linkFromText($text)
{
    $patternGenericTld = '(?:tld|aero|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|asia|post|geo)';
    // TLDs are matched case-sensitively (lowercase only)
    $patternTld = '(?-i:' . $patternGenericTld . '|[a-z]{2})';
    $patternDomain = '(?:(?:[a-z]|[a-z0-9](?:[-a-z0-9]{0,61}[a-z0-9]))[.])*(?:[a-z0-9](?:[-a-z0-9]{0,61}[a-z0-9])[.]' . $patternTld . ')';

    $pattern8bit = '(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]?[0-9])';
    $patternIPv4 = '(?:' . $pattern8bit . '(?:[.]' . $pattern8bit . '){3})';

    // IPv6: eight hexadecimal groups
    $patternIpV6Variant8Hex = '(?:(?:[0-9a-f]{1,4}:){7}[0-9a-f]{1,4})';
    // IPv6: compressed form using "::"
    $patternIpV6VariantCompressedHex = '(?:(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?)::(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?))';
    // IPv6: six hexadecimal groups followed by an IPv4 address
    $patternIpV6VariantHex4Dec = '(?:(?:(?:[0-9a-f]{1,4}:){6})' . $patternIPv4 . ')';
    // IPv6: compressed form followed by an IPv4 address
    $patternIpV6VariantCompressedHex4Dec = '(?:(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?)::(?:(?:[0-9a-f]{1,4}:)*)' . $patternIPv4 . ')';
    $patternIpV6 = '(?:' . $patternIpV6Variant8Hex . '|' . $patternIpV6VariantCompressedHex . '|' . $patternIpV6VariantHex4Dec . '|' . $patternIpV6VariantCompressedHex4Dec . ')';

    // E-mail: optional "mailto:" prefix and the local part
    $patternEmail = '(?:mailto:)?(?:[-\w!#\$%&\'*+/=?^`{|}\~]+(?:[.][-\w!#\$%&\'*+/=?^`{|}\~]+)*)';
    // E-mail: domain
    $patternEmail .= '(?:@' . $patternDomain . ')';

    // URL: optional scheme with optional user info
    $patternUrl = '(?:(?:http|ftp)s?://(?:[\S]+(?:[:][\S]*)?@)?)?';
    // URL: host
    $patternUrl .= '(?:' . $patternDomain . '|' . $patternIPv4 . '|' . $patternIpV6 . ')';
    // URL: optional port and path
    $patternUrl .= '(?::[0-9]+)?(?:(?:/[-\w\pL\pN\~.:!%]+)*(?:/|[.][a-z0-9]{2,4})?)?';
    // URL: optional query string and fragment
    $patternUrl .= '(?:[?][\]\[-\w\pL\pN.,?!\~%#@&;:/\'\=+]*)?(?:#[\]\[-\w\pL\pN.,?!\~%@&;:/\'\=+]*)?';

    return preg_replace_callback('~(^|[^\pL\pN])(?:(' . $patternEmail . ')|(' . $patternUrl . '))(?=$|\W)~iu', function($matches) {
        // Group 3 is present only when the URL alternative matched
        if (isset($matches[3])) {
            $url = $matches[3];

            // Trailing punctuation does not belong to the URL; "&gt;" is an escaped ">" that the query part may have swallowed
            $punctuation = '';
            if (preg_match('~(?:[.,:;?!>)\]}]|&gt;)+$~i', $url, $trailing)) {
                $punctuation = $trailing[0];
                $url = mb_substr($url, 0, -mb_strlen($punctuation, 'utf-8'), 'utf-8');
            }

            // The href always needs a scheme
            $linkUrl = !preg_match('~^(http|ftp)s?://~i', $url) ? 'http://' . $url : $url;

            return $matches[1] . '<a href="' . $linkUrl . '">' . $url . '</a>' . $punctuation;
        }

        if (isset($matches[2])) {
            $email = $matches[2];
            $protocol = '';
            if (0 === stripos($email, 'mailto:')) {
                // The prefix goes into the href only once but stays visible in the link text
                $email = substr($email, 7);
                $protocol = 'mailto:';
            }

            return $matches[1] . '<a href="mailto:' . $email . '">' . $protocol . $email . '</a>';
        }

        // Not expected to be reached; leave the text untouched
        return $matches[0];
    }, $text);
}
455:
456: 457: 458: 459: 460: 461:
/**
 * Converts HTML to plain text.
 *
 * Block elements become blank lines, headings of level 1-2, <strong> and
 * <th> content is upper-cased, <hr> becomes a line of dashes, table cells
 * are separated by tabs, links are rewritten to "text <url>", lists and
 * blockquotes are rendered as text, remaining tags are stripped and
 * entities are decoded.
 *
 * @param string $html HTML source
 * @return string Plain text
 */
public static function toText($html)
{
    $text = $html;

    // Line breaks inside <pre> are significant; turn them into <br /> before newlines get collapsed
    $text = preg_replace_callback('~<pre[^>]*>(.+?)</pre>~is', function($matches) {
        return nl2br($matches[1]);
    }, $text);

    // Source formatting is irrelevant, only <br> produces a line break
    $text = preg_replace(
        array("~\r~", "~[\n\t]+~", '~<br[^>]*>~i'),
        array('', ' ', "\n"),
        $text
    );

    static $search = array(
        '~<h[3-6][^>]*>(.+?)</h[3-6]>~is',
        '~(<div[^>]*>)|(</div>)~i',
        '~(<p(?:\s+[^>]+)?>)|(</p>)~i',
        '~(<table[^>]*>)|(</table>)~i',
        '~</tr>*~i',
        '~<td[^>]*>(.+?)</td>~is',
        // Typographic characters, matched as entities (nothing is decoded yet) and as literal characters
        '~&hellip;|&#8230;|…~i',
        '~&ldquo;|&rdquo;|&#8220;|&#8221;|“|”~i',
        '~&apos;|&#0*39;~i',
        '~&copy;|&#169;|©~i',
        '~&trade;|&#8482;|™~i',
        '~&reg;|&#174;|®~i',
        '~&mdash;|&ndash;|&#8212;|&#8211;|—|–~i'
    );
    // Index-aligned with $search
    static $replace = array(
        "\n\n\\1\n\n",
        "\n\n",
        "\n\n",
        "\n\n",
        "\n",
        "\\1\t",
        '...',
        '"',
        '\'',
        '(c)',
        '(tm)',
        '(R)',
        '-'
    );
    $text = preg_replace($search, $replace, $text);

    // Main headings are upper-cased and get extra spacing
    $text = preg_replace_callback('~<h[12][^>]*>(.+?)</h[12]>~is', function($matches) {
        return "\n\n\n" . mb_strtoupper($matches[1], 'utf-8') . "\n\n";
    }, $text);

    // Emphasis is expressed by upper-casing
    $text = preg_replace_callback('~<strong[^>]*>(.+?)</strong>~is', function($matches) {
        return mb_strtoupper($matches[1], 'utf-8');
    }, $text);

    // Horizontal rule
    $text = preg_replace('~<hr[^>]*>~i', "\n" . str_repeat('-', 50) . "\n", $text);

    // Table headers
    $text = preg_replace_callback('~<th[^>]*>(.+?)</th>~is', function($matches) {
        return mb_strtoupper($matches[1], 'utf-8') . "\t";
    }, $text);

    $text = self::linkToText($text);
    $text = self::listToText($text);

    $text = trim($text, "\n ");
    $text = preg_replace("~\n\s+\n~", "\n\n", $text);

    $text = self::blockquoteToText($text);

    $text = strip_tags($text);

    // Link placeholders can become angle brackets only after the tags have been stripped
    $text = preg_replace('~\[textlink\]\s*~s', '<', $text);
    $text = preg_replace('~\s*\[/textlink\]~s', '>', $text);

    // Four non-breaking spaces stand for a tab, a single one for a space
    $text = preg_replace(
        array('~(?:&nbsp;|&#160;|\xC2\xA0){4}~i', '~&nbsp;|&#160;|\xC2\xA0~i'),
        array("\t", ' '),
        $text
    );

    // Decode entities. The original case is tried first so that e.g. "&Eacute;" keeps its
    // upper-case letter; the lower-cased form is only a fallback for entities written in wrong case.
    $text = preg_replace_callback('~(&#?[a-z0-9]+;)~i', function($matches) {
        $decoded = html_entity_decode($matches[1], ENT_QUOTES, 'utf-8');
        if ($decoded !== $matches[1]) {
            return $decoded;
        }

        return html_entity_decode(strtolower($matches[1]), ENT_QUOTES, 'utf-8');
    }, $text);

    $text = trim($text, "\n ");
    $text = preg_replace("~\n\s+\n~", "\n\n", $text);

    // Consecutive empty citation lines are merged into one
    $text = preg_replace("~(\n>\s*)+\n~", "\n>\n", $text);

    // Remove spaces at line starts and after tabs, collapse runs of spaces
    $text = preg_replace("~(\n|\t)( )+~", '\1', $text);
    $text = preg_replace('~( ){2,}~', ' ', $text);

    // Remove whitespace at line ends
    $text = preg_replace("~[ \t]+\n~", "\n", $text);

    return $text;
}
573:
574: 575: 576: 577: 578: 579:
/**
 * Rewrites <a href="..."> links into a plain-text form.
 *
 * Links with an empty href, an href starting with "#" or "/?", or an href
 * rejected by Input\Validator\IsUrl are replaced by their content only.
 * Other links become "[textlink]url[/textlink]" when the visible text equals
 * the URL, or "content [textlink]url[/textlink]" otherwise.
 *
 * @param string $text HTML source
 * @return string Source with links rewritten
 */
private static function linkToText($text)
{
    return preg_replace_callback('~<a\s+(?:[^>]+\s+)*href\s*=\s*"([^"]+)"(?:\s+[^>]*)?>(.+?)</a>~is', function($link) {
        list(, $href, $label) = $link;
        $href = trim($href);

        // Anchors and query-only links have no meaning outside of the page
        $isLocal = empty($href) || '#' === $href[0] || '/?' === substr($href, 0, 2);
        if ($isLocal || !Input\Validator\IsUrl::validate($href)) {
            return $label;
        }

        // Do not print the address twice when it already is the link text
        if (trim(strip_tags($label)) === $href) {
            return '[textlink]' . $label . '[/textlink]';
        }

        return $label . ' [textlink]' . $href . '[/textlink]';
    }, $text);
}
605:
606: 607: 608: 609: 610: 611:
/**
 * Renders <ol>/<ul> lists as plain text.
 *
 * List boundaries become blank lines. Every item is indented by one tab per
 * open list level and prefixed either with a bullet symbol chosen by the
 * unordered nesting depth or with its ordinal number. Closing </li> becomes
 * a line break. All other tags and text are passed through unchanged.
 *
 * @param string $text HTML source
 * @return string Source with lists rendered as text
 */
private static function listToText($text)
{
    static $symbols = array('#', '*', 'o', '+');

    // Split into opening tags, closing tags, declarations and text chunks
    preg_match_all('~(?:<[a-z][a-z0-9]*[^>]*(?: /)?>)|(?:</[a-z][a-z0-9]*>)|(?:<![^>]+>)|(?:[^<]+)~i', $text, $matches);
    $text = '';
    $ulLevel = 0;
    $olLevel = 0;
    $olLiCount = array();
    $path = array();

    foreach ($matches[0] as $textPart) {
        // The name must end right after ol/ul/li so that e.g. <link> is not taken for <li>
        $tag = null;
        if (preg_match('~^<(/?)(ol|ul|li)(?=[\s/>])~i', $textPart, $tagMatch)) {
            $tag = $tagMatch[1] . strtolower($tagMatch[2]);
        }

        if ('ol' === $tag) {
            array_push($path, 'ol');
            $olLevel++;
            $olLiCount[$olLevel] = 1;
            $textPart = "\n\n";
        } elseif ('/ol' === $tag) {
            array_pop($path);
            // Levels never drop below zero, a stray closing tag must not break the indentation
            $olLevel = max(0, $olLevel - 1);
            $textPart = "\n\n";
        } elseif ('ul' === $tag) {
            array_push($path, 'ul');
            $ulLevel++;
            $textPart = "\n\n";
        } elseif ('/ul' === $tag) {
            array_pop($path);
            $ulLevel = max(0, $ulLevel - 1);
            $textPart = "\n\n";
        } elseif ('li' === $tag) {
            $textPart = str_repeat("\t", $olLevel + $ulLevel);
            if ('ul' === end($path)) {
                $textPart .= $symbols[$ulLevel % 4] . ' ';
            } elseif ('ol' === end($path)) {
                // Mismatched nesting may leave the counter for this level undefined
                if (!isset($olLiCount[$olLevel])) {
                    $olLiCount[$olLevel] = 1;
                }
                $textPart .= $olLiCount[$olLevel] . '. ';
                $olLiCount[$olLevel]++;
            }
        } elseif ('/li' === $tag) {
            $textPart = "\n";
        }

        $text .= $textPart;
    }

    return $text;
}
658:
659: 660: 661: 662: 663: 664:
/**
 * Renders <blockquote> elements as text citations.
 *
 * Text inside blockquotes is trimmed and every line of it is prefixed with
 * one ">" per nesting level followed by a space; the blockquote tags
 * themselves are removed.
 *
 * @param string $text HTML source
 * @return string Source with blockquotes rendered as text
 */
private static function blockquoteToText($text)
{
    // Split into opening tags, closing tags and the text between them
    $found = preg_match_all('~(?:<blockquote[^>]*>\s*)|(?:\s*</blockquote>)|(?:.+?(?=</?blockquote)|(?:.+))~is', $text, $matches);
    if (!($found > 0)) {
        return $text;
    }

    $result = '';
    $depth = 0;
    foreach ($matches[0] as $chunk) {
        $lowered = strtolower($chunk);
        $opened = substr_count($lowered, '<blockquote');
        $closed = substr_count($lowered, '</blockquote>');

        if ($opened > 0) {
            $depth += $opened;
            // Only the outermost blockquote starts on a new line
            $chunk = (1 == $depth) ? "\n" : '';
        } elseif ($closed > 0) {
            $depth -= $closed;
            $chunk = '';
        } elseif ($depth > 0) {
            // Prefix every line of the quoted text and end with a bare marker line
            $marks = str_repeat('>', $depth);
            $quoted = str_replace("\n", "\n" . $marks . ' ', trim($chunk));
            $chunk = "\n" . $marks . ' ' . $quoted . "\n" . $marks;
        }

        $result .= $chunk;
    }

    return $result;
}
689: }
690: