1: <?php
2:
3: 4: 5: 6: 7: 8: 9: 10: 11: 12:
13:
14: 15: 16: 17: 18: 19: 20: 21: 22:
/**
 * Static helpers for detecting, cleaning and converting HTML.
 *
 * Most of the cleaning methods are regular-expression based and expect
 * double-quoted attributes, i.e. markup that went through {@see Jyxo_Html::repair()}.
 */
class Jyxo_Html
{
    /**
     * Constructor; the class only offers static methods and cannot be instantiated.
     *
     * @throws LogicException Always.
     */
    final public function __construct()
    {
        throw new LogicException(sprintf('Cannot create an instance of a static class %s.', get_class($this)));
    }

    /**
     * Tells whether the given text contains something that looks like an HTML opening tag.
     *
     * @param string $text Text to test
     * @return boolean
     */
    public static function is($text)
    {
        return (bool) preg_match('~<[a-z][a-z0-9]*(\s[^<]*)?>~i', $text);
    }

    /**
     * Repairs HTML using the Tidy extension and normalizes the result.
     *
     * After Tidy runs, the output is stripped of <?xml:namespace> declarations, line breaks
     * inside <pre> are turned into <br /> elements, all remaining CR/LF characters are removed,
     * single-quoted attribute values are converted to double quotes and leading/trailing
     * whitespace inside attribute values is trimmed.
     *
     * @param string $html Input HTML (passed to Tidy as UTF-8)
     * @return string
     */
    public static function repair($html)
    {
        static $config = array(
            // Output format
            'newline' => 'LF',
            'indent' => false,
            'output-xhtml' => true,
            'output-bom' => false,
            'doctype' => 'auto',

            'bare' => true,
            'wrap' => 0,
            'wrap-sections' => false,

            // Clean-up behaviour
            'enclose-text' => true,
            'merge-divs' => false,
            'merge-spans' => false,

            // Diagnostics and entities
            'force-output' => true,
            'show-errors' => 0,
            'show-warnings' => false,
            'escape-cdata' => true,
            'preserve-entities' => true
        );
        $html = tidy_repair_string($html, $config, 'utf8');

        // Drop <?xml:namespace ...> declarations
        $html = preg_replace('~<\?xml:namespace[^>]*>~i', '', $html);

        // Line breaks inside <pre> have to survive the newline removal below,
        // so they are converted to <br /> first
        $html = preg_replace("~(<pre[^>]*>)\n~", '\1', $html);
        $html = preg_replace("~\n</pre>~", '</pre>', $html);
        $html = preg_replace_callback('~(<pre[^>]*>)(.+?)(</pre>)~s', function($matches) {
            return $matches[1] . strtr(nl2br($matches[2]), array('\"' => '"')) . $matches[3];
        }, $html);

        // Remove all line breaks
        $html = strtr($html, array("\r" => '', "\n" => ''));

        // Single-quoted attribute values -> double-quoted
        $html = preg_replace('~(<[a-z][a-z0-9]*[^>]+[a-z]+=)\'([^\']*)\'~i', '\1"\2"', $html);

        // Trim whitespace at the beginning and at the end of attribute values
        $html = preg_replace('~(<[a-z][a-z0-9]*[^>]+[a-z]+=")\s+([^"]*")~i', '\1\2', $html);
        $html = preg_replace('~(<[a-z][a-z0-9]*[^>]+[a-z]+="[^"]*)\s+(")~i', '\1\2', $html);

        return $html;
    }

    /**
     * Removes the given tags (including their contents for paired tags) from HTML.
     *
     * When no tags are given, a default list of frame, script, style, embedded object
     * and form related tags is used. Tag names are inserted into the regular expressions
     * verbatim.
     *
     * @param string $html Input HTML
     * @param array $tags Tag names to remove
     * @return string
     */
    public static function removeTags($html, array $tags = array())
    {
        static $default = array(
            'frameset', 'frame', 'noframes', 'iframe', 'script', 'noscript', 'style', 'link',
            'object', 'embed', 'form', 'input', 'select', 'textarea', 'button'
        );

        if (empty($tags)) {
            $tags = $default;
        }

        foreach ($tags as $tag) {
            // The "(?:[\s/][^>]*)?" part keeps the match on the exact tag name,
            // so e.g. "img" does not remove <image> and "meta" does not remove <metadata>
            $single = '~\s*<' . $tag . '(?:[\s/][^>]*)?>~is';

            switch ($tag) {
                case 'embed':
                    // <embed> appears both with and without a closing tag
                    $pattern = array('~\s*<embed(?:[\s/][^>]*)?>.*?</embed>~is', $single);
                    break;

                // Void elements
                case 'link':
                case 'meta':
                case 'br':
                case 'hr':
                case 'img':
                case 'input':
                    $pattern = array($single);
                    break;

                // Paired elements are removed together with their contents
                default:
                    $pattern = array('~\s*<' . $tag . '(?:\s+[^>]*)?>.*?</' . $tag . '>~is');
                    break;
            }

            $html = preg_replace($pattern, '', $html);
        }

        return $html;
    }

    /**
     * Removes nested occurrences of a tag, keeping only the outermost opening/closing pair.
     *
     * Only the tags themselves are dropped; their contents are preserved. Closing tags
     * without a matching opening tag are dropped as well.
     *
     * @param string $html Input HTML
     * @param string $tag Name of a paired tag
     * @return string
     */
    public static function removeInnerTags($html, $tag)
    {
        $tag = (string) $tag;
        if ('' === $tag) {
            // Nothing sensible can be matched against an empty tag name
            return $html;
        }

        $closing = '~^</' . preg_quote($tag, '~') . '(?=[\s>])~i';

        // Split the input into tags and text nodes
        if (preg_match_all('~<[^>]+>|[^<]+~', $html, $matches)) {
            $html = '';
            $level = 0;
            foreach ($matches[0] as $htmlPart) {
                if (self::isOpeningTag($htmlPart, $tag)) {
                    $level++;
                    if (1 === $level) {
                        $html .= $htmlPart;
                    }
                } elseif (preg_match($closing, $htmlPart)) {
                    if (1 === $level) {
                        $html .= $htmlPart;
                    }
                    // Never go below zero on a stray closing tag; otherwise the next
                    // legitimate outer pair would be treated as nested and dropped
                    if ($level > 0) {
                        $level--;
                    }
                } else {
                    $html .= $htmlPart;
                }
            }
        }

        return $html;
    }

    /**
     * Removes the given double-quoted attributes from all tags.
     *
     * When no attributes are given, "id" and "class" are removed. Attribute names are
     * inserted into the regular expression verbatim.
     *
     * @param string $html Input HTML
     * @param array $attributes Attribute names to remove
     * @return string
     */
    public static function removeAttributes($html, array $attributes = array())
    {
        static $default = array('id', 'class');

        if (empty($attributes)) {
            $attributes = $default;
        }

        foreach ($attributes as $attribute) {
            $html = preg_replace('~(<[a-z][a-z0-9]*[^>]*?)\s+' . $attribute . '="[^"]*"~is', '\1', $html);
        }

        return $html;
    }

    /**
     * Removes double-quoted on* event handler attributes from all tags.
     *
     * @param string $html Input HTML
     * @return string
     */
    public static function removeJavascriptEvents($html)
    {
        // One pass removes at most one handler per tag, hence the loop
        while (preg_match('~<[a-z][a-z0-9]*[^>]*?\s+on[a-z]+="[^"]*"~is', $html)) {
            $html = preg_replace('~(<[a-z][a-z0-9]*[^>]*?)\s+on[a-z]+="[^"]*"~is', '\1', $html);
        }

        return $html;
    }

    /**
     * Removes references to remote (http/https) images.
     *
     * Image sources are replaced by "about:blank", "background" attribute values are emptied
     * and url(...) parts of background/list-style declarations in style attributes are removed.
     *
     * @param string $html Input HTML
     * @return string
     */
    public static function removeRemoteImages($html)
    {
        static $remoteImages = array(
            '~(<img[^>]+src=")http(?:s)?://[^"]+(")~i',
            '~(<[a-z][a-z0-9]*[^>]+background=")http(?:s)?://[^"]+(")~i',
            '~(<[a-z][a-z0-9]*[^>]+style="[^"]*background\s*[:])([-a-z0-9#%\s]*)url\([^)]+\)(;)?~is',
            '~(<[a-z][a-z0-9]*[^>]+style="[^"]*)background-image\s*[:]([-a-z0-9#%\s]*)url\([^)]+\)(;)?~is',
            '~(<[a-z][a-z0-9]*[^>]+style="[^"]*list-style\s*[:])([-a-z0-9\s]*)url\([^)]+\)(;)?~is',
            '~(<[a-z][a-z0-9]*[^>]+style="[^"]*)list-style-image\s*[:]([-a-z0-9\s]*)url\([^)]+\)(;)?~is'
        );

        // Replacements in the same order as the patterns above
        static $remoteImagesReplacement = array(
            '\1about:blank\2',
            '\1\2',
            '\1\2\3',
            '\1',
            '\1\2\3',
            '\1'
        );

        return preg_replace($remoteImages, $remoteImagesReplacement, $html);
    }

    /**
     * Removes double-quoted href/src attributes whose value starts with "javascript" or "data:".
     *
     * @param string $html Input HTML
     * @return string
     */
    public static function removeDangerous($html)
    {
        static $dangerous = array(
            '~\s+href="javascript[^"]*"~i',
            '~\s+src="javascript[^"]*"~i',
            '~\s+href="data:[^"]*"~i',
            '~\s+src="data:[^"]*"~i'
        );

        return preg_replace($dangerous, '', $html);
    }

    /**
     * Returns the trimmed contents of the <body> element.
     *
     * When <body> has a style attribute, the contents are wrapped in a <div> carrying
     * that attribute. When no <body> is found, the input is returned unchanged.
     *
     * @param string $html Input HTML
     * @return string
     */
    public static function getBody($html)
    {
        if (preg_match('~<body([^>]*)>(.*?)</body>~is', $html, $matches)) {
            $body = trim($matches[2]);

            // Keep the body styling
            if (preg_match('~style="[^"]+"~i', $matches[1], $style)) {
                $body = '<div ' . $style[0] . '>' . $body . '</div>';
            }

            return $body;
        }

        return $html;
    }

    /**
     * Converts plain text to HTML.
     *
     * The text is escaped, blank lines start new paragraphs, single line breaks become
     * <br />, lines starting with ">" (e-mail style quotes) become nested
     * <blockquote type="cite"> elements and, optionally, URLs and e-mail addresses are
     * turned into links.
     *
     * @param string $text Input text
     * @param boolean $convertLinks Convert URLs and e-mail addresses to links
     * @return string
     */
    public static function fromText($text, $convertLinks = true)
    {
        // Trim line breaks only; leading spaces may be an intentional indent
        $text = trim($text, "\r\n");

        // Lines containing only whitespace become empty lines
        $text = preg_replace("~\n\s+\n~", "\n\n", $text);

        $html = htmlspecialchars($text, ENT_QUOTES, 'utf-8', false);

        // Two spaces mean an indent; keep it using non-breaking spaces
        $html = str_replace('  ', '&nbsp;&nbsp;', $html);
        // A tab becomes four non-breaking spaces (toText() converts them back)
        $html = str_replace("\t", '&nbsp;&nbsp;&nbsp;&nbsp;', $html);

        // Paragraphs and line breaks
        $html = '<p>' . preg_replace("~\n\n[^\\n]?~", '</p><p>\0', $html) . '</p>';
        $html = str_replace("\n", "<br />\n", $html);
        $html = str_ireplace('<p><br />', "<p>\n", $html);

        // Quotes; the text is already escaped at this point, so the quote mark
        // has to be looked up as "&gt;", not as a raw ">"
        preg_match_all('~(?:(^(?:<p>)?\s*&gt;(?:&gt;|\s)*)(.*)$)|(?:.+)~im', $html, $matches);
        $html = '';
        $offset = 0;
        foreach ($matches[0] as $i => $line) {
            // Quote depth of the current line
            $currentOffset = substr_count($matches[1][$i], '&gt;');
            if ($currentOffset > 0) {
                if ($currentOffset > $offset) {
                    $html .= str_repeat('<blockquote type="cite">', $currentOffset - $offset) . '<p>';
                    $offset = $currentOffset;
                } elseif ($currentOffset < $offset) {
                    $html .= '</p>' . str_repeat('</blockquote>', $offset - $currentOffset) . '<p>';
                    $offset = $currentOffset;
                }

                // Line contents without the quote prefix
                $html .= $matches[2][$i];
            } else {
                if ($offset > 0) {
                    $html .= '</p>' . str_repeat('</blockquote>', $offset) . '<p>';
                    $offset = 0;
                }

                $html .= $line;
            }
        }
        if ($offset > 0) {
            $html .= '</p>' . str_repeat('</blockquote>', $offset);
        }

        // Remove needless line breaks and empty paragraphs
        $html = preg_replace('~(?:<br />)+</p></blockquote>~i', '</p></blockquote>', $html);
        $html = str_ireplace('<p><br /></p>', '', $html);
        $html = str_ireplace('<p><br />', '<p>', $html);

        if ($convertLinks) {
            $html = self::linkFromText($html);
        }

        return $html;
    }

    /**
     * Converts URLs and e-mail addresses found in (HTML-escaped) text to links.
     *
     * URLs without a scheme are linked with "http://"; e-mail addresses are linked
     * with "mailto:".
     *
     * @param string $text Input text
     * @return string
     */
    public static function linkFromText($text)
    {
        $patternGenericTld = '(?:tld|aero|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|asia|post|geo)';
        // Top level domains are matched case-sensitively (lowercase only)
        $patternTld = '(?-i:' . $patternGenericTld . '|[a-z]{2})';
        $patternDomain = '(?:(?:[a-z]|[a-z0-9](?:[-a-z0-9]{0,61}[a-z0-9]))[.])*(?:[a-z0-9](?:[-a-z0-9]{0,61}[a-z0-9])[.]' . $patternTld . ')';

        $pattern8bit = '(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]?[0-9])';
        $patternIPv4 = '(?:' . $pattern8bit . '(?:[.]' . $pattern8bit . '){3})';

        // a:b:c:d:e:f:g:h
        $patternIpV6Variant8Hex = '(?:(?:[0-9a-f]{1,4}:){7}[0-9a-f]{1,4})';
        // Compressed form a::b
        $patternIpV6VariantCompressedHex = '(?:(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?)::(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?))';
        // IPv4 mapped to IPv6 a:b:c:d:e:f:w.x.y.z
        $patternIpV6VariantHex4Dec = '(?:(?:(?:[0-9a-f]{1,4}:){6})' . $patternIPv4 . ')';
        // Compressed IPv4 mapped to IPv6 a::b:w.x.y.z
        $patternIpV6VariantCompressedHex4Dec = '(?:(?:(?:[0-9a-f]{1,4}(?::[0-9a-f]{1,4})*)?)::(?:(?:[0-9a-f]{1,4}:)*)' . $patternIPv4 . ')';
        $patternIpV6 = '(?:' . $patternIpV6Variant8Hex . '|' . $patternIpV6VariantCompressedHex . '|' . $patternIpV6VariantHex4Dec . '|' . $patternIpV6VariantCompressedHex4Dec . ')';

        // E-mail: optional mailto: prefix, local part, domain
        $patternEmail = '(?:mailto:)?(?:[-\w!#\$%&\'*+/=?^`{|}\~]+(?:[.][-\w!#\$%&\'*+/=?^`{|}\~]+)*)';
        $patternEmail .= '(?:@' . $patternDomain . ')';

        // URL: optional scheme with optional credentials
        $patternUrl = '(?:(?:http|ftp)s?://(?:[\S]+(?:[:][\S]*)?@)?)?';
        // Host
        $patternUrl .= '(?:' . $patternDomain . '|' . $patternIPv4 . '|' . $patternIpV6 . ')';
        // Optional port and path
        $patternUrl .= '(?::[0-9]+)?(?:(?:/[-\w\pL\pN\~.:!%]+)*(?:/|[.][a-z0-9]{2,4})?)?';
        // Optional query string and fragment
        $patternUrl .= '(?:[?][\]\[-\w\pL\pN.,?!\~%#@&;:/\'\=+]*)?(?:#[\]\[-\w\pL\pN.,?!\~%@&;:/\'\=+]*)?';

        return preg_replace_callback('~(^|[^\pL\pN])(?:(' . $patternEmail . ')|(' . $patternUrl . '))(?=$|\W)~iu', function($matches) {
            // URL; group 3 is missing when the e-mail alternative matched
            if (isset($matches[3]) && '' !== $matches[3]) {
                $url = $matches[3];

                // Trailing punctuation (including an escaped ">") is not a part of the URL
                if (preg_match('~(([.,:;?!>)\]}]|(&gt;))+)$~i', $url, $matches2)) {
                    $punctuation = $matches2[1];
                    $url = mb_substr($url, 0, -mb_strlen($punctuation, 'utf-8'), 'utf-8');
                } else {
                    $punctuation = '';
                }

                // Add a scheme where missing
                $linkUrl = !preg_match('~^(http|ftp)s?://~i', $url) ? 'http://' . $url : $url;

                return $matches[1] . '<a href="' . $linkUrl . '">' . $url . '</a>' . $punctuation;
            }

            // E-mail
            if (isset($matches[2])) {
                $email = $matches[2];
                if (0 === stripos($email, 'mailto:')) {
                    // Keep the prefix in the link text, but do not duplicate it in href
                    $email = substr($email, 7);
                    $protocol = 'mailto:';
                } else {
                    $protocol = '';
                }

                return $matches[1] . '<a href="mailto:' . $email . '">' . $protocol . $email . '</a>';
            }
        }, $text);
    }

    /**
     * Converts HTML to plain text.
     *
     * Block elements become blank lines, <h1>/<h2>/<strong>/<th> contents are uppercased,
     * table cells are separated by tabs, links get their target appended in angle brackets,
     * lists are indented with bullets or numbers, blockquotes are prefixed with ">" and
     * remaining tags and entities are removed/decoded.
     *
     * @param string $html Input HTML
     * @return string
     */
    public static function toText($html)
    {
        $text = $html;

        // Line breaks inside <pre> are significant; turn them into <br /> so they
        // survive the whitespace normalization below
        $text = preg_replace_callback('~<pre[^>]*>(.+?)</pre>~is', function($matches) {
            return nl2br($matches[1]);
        }, $text);

        // Drop CR, collapse line breaks and tabs to a space, <br> means a line break
        $text = preg_replace(
            array("~\r~", "~[\n\t]+~", '~<br[^>]*>~i'),
            array('', ' ', "\n"),
            $text
        );

        static $search = array(
            // Block elements
            '~<h[3-6][^>]*>(.+?)</h[3-6]>~is',
            '~(<div[^>]*>)|(</div>)~i',
            '~(<p(?:\s+[^>]+)?>)|(</p>)~i',
            '~(<table[^>]*>)|(</table>)~i',
            '~</tr\s*>~i',
            '~<td[^>]*>(.+?)</td>~is',
            // Typographic entities replaced by plain ASCII equivalents
            '~(&hellip;)~i',
            '~(&ldquo;)|(&rdquo;)~i',
            '~(&apos;)~i',
            '~(&copy;)|(&#169;)~i',
            '~&trade;~i',
            '~&reg;~i',
            '~(&mdash;)|(&ndash;)~i'
        );
        static $replace = array(
            "\n\n\\1\n\n",
            "\n\n",
            "\n\n",
            "\n\n",
            "\n",
            "\\1\t",
            '...',
            '"',
            '\'',
            '(c)',
            '(tm)',
            '(R)',
            '-'
        );
        $text = preg_replace($search, $replace, $text);

        // Main headings
        $text = preg_replace_callback('~<h[12][^>]*>(.+?)</h[12]>~is', function($matches) {
            return "\n\n\n" . mb_strtoupper($matches[1], 'utf-8') . "\n\n";
        }, $text);
        // Emphasis
        $text = preg_replace_callback('~<strong[^>]*>(.+?)</strong>~is', function($matches) {
            return mb_strtoupper($matches[1], 'utf-8');
        }, $text);
        // Horizontal rule
        $text = preg_replace_callback('~<hr[^>]*>~i', function($matches) {
            return "\n" . str_repeat('-', 50) . "\n";
        }, $text);
        // Table header cells
        $text = preg_replace_callback('~<th[^>]*>(.+?)</th>~is', function($matches) {
            return mb_strtoupper($matches[1], 'utf-8') . "\t";
        }, $text);

        $text = self::linkToText($text);
        $text = self::listToText($text);

        // Lines containing only whitespace become empty lines
        $text = trim($text, "\n ");
        $text = preg_replace("~\n\s+\n~", "\n\n", $text);

        $text = self::blockquoteToText($text);

        $text = strip_tags($text);

        // Link placeholders can be turned into angle brackets only after strip_tags()
        $text = preg_replace('~\[textlink\]\s*~s', '<', $text);
        $text = preg_replace('~\s*\[/textlink\]~s', '>', $text);

        // Four non-breaking spaces are a tab (see fromText()), a single one is a space
        $text = preg_replace(array('~&nbsp;&nbsp;&nbsp;&nbsp;~i', '~&nbsp;~i'), array("\t", ' '), $text);

        // Decode remaining entities. Entity names are case-sensitive (&Eacute; vs. &eacute;),
        // so the entity is decoded as written first; the lowercase variant is only
        // a fallback for sloppy markup like &AMP;
        $text = preg_replace_callback('~(&#?[a-z0-9]+;)~i', function($matches) {
            $decoded = html_entity_decode($matches[1], ENT_QUOTES, 'utf-8');
            if ($decoded === $matches[1]) {
                $decoded = html_entity_decode(strtolower($matches[1]), ENT_QUOTES, 'utf-8');
            }
            return $decoded;
        }, $text);

        // Lines containing only whitespace become empty lines
        $text = trim($text, "\n ");
        $text = preg_replace("~\n\s+\n~", "\n\n", $text);
        // Collapse runs of empty quoted lines
        $text = preg_replace("~(\n>\s*)+\n~", "\n>\n", $text);

        // Spaces at line beginnings and after tabs, repeated spaces
        $text = preg_replace("~(\n|\t)( )+~", '\1', $text);
        $text = preg_replace('~( ){2,}~', ' ', $text);

        // Whitespace at line ends
        $text = preg_replace("~[ \t]+\n~", "\n", $text);

        return $text;
    }

    /**
     * Converts HTML links to their text form.
     *
     * Links with an empty, fragment-only ("#...") or "/?..." target, or with a target
     * rejected by Jyxo_Input_Validator_IsUrl, are replaced by their contents. Other links
     * are replaced by their contents followed by the target wrapped in [textlink] markers
     * (only the wrapped contents when the contents equal the target).
     *
     * @param string $text Input HTML
     * @return string
     */
    private static function linkToText($text)
    {
        return preg_replace_callback('~<a\s+(?:[^>]+\s+)*href\s*=\s*"([^"]+)"(?:\s+[^>]*)?>(.+?)</a>~is', function($matches) {
            $url = trim($matches[1]);
            $content = $matches[2];
            $clearContent = trim(strip_tags($content));

            // Some targets have no meaning in a text
            if ((empty($url)) || ('#' === $url[0]) || ('/?' === substr($url, 0, 2))) {
                return $content;
            }

            // Invalid URL
            if (!Jyxo_Input_Validator_IsUrl::validate($url)) {
                return $content;
            }

            // The markers are converted to angle brackets in toText() after tags are stripped
            if ($url === $clearContent) {
                return '[textlink]' . $content . '[/textlink]';
            } else {
                return $content . ' [textlink]' . $url . '[/textlink]';
            }
        }, $text);
    }

    /**
     * Converts HTML lists to their text form.
     *
     * Items are indented by one tab per nesting level; unordered items are prefixed with
     * a bullet symbol chosen by nesting level, ordered items with their number.
     *
     * @param string $text Input HTML
     * @return string
     */
    private static function listToText($text)
    {
        static $symbols = array('#', '*', 'o', '+');

        // Split the input into tags, declarations and text nodes
        preg_match_all('~(?:<[a-z][a-z0-9]*[^>]*(?: /)?>)|(?:</[a-z][a-z0-9]*>)|(?:<![^>]+>)|(?:[^<]+)~i', $text, $matches);
        $text = '';
        $ulLevel = 0;
        $olLevel = 0;
        $olLiCount = array();
        $path = array();

        foreach ($matches[0] as $textPart) {
            if (self::isOpeningTag($textPart, 'ol')) {
                array_push($path, 'ol');
                $olLevel++;
                $olLiCount[$olLevel] = 1;
                $textPart = "\n\n";
            } elseif ('</ol>' === strtolower($textPart)) {
                array_pop($path);
                // Clamped so that a stray closing tag cannot produce a negative indent
                $olLevel = max(0, $olLevel - 1);
                $textPart = "\n\n";
            } elseif (self::isOpeningTag($textPart, 'ul')) {
                array_push($path, 'ul');
                $ulLevel++;
                $textPart = "\n\n";
            } elseif ('</ul>' === strtolower($textPart)) {
                array_pop($path);
                $ulLevel = max(0, $ulLevel - 1);
                $textPart = "\n\n";
            } elseif (self::isOpeningTag($textPart, 'li')) {
                // The exact name check prevents <link> from being treated as <li>
                $textPart = str_repeat("\t", $olLevel + $ulLevel);
                if ('ul' === end($path)) {
                    $textPart .= $symbols[$ulLevel % 4] . ' ';
                } elseif ('ol' === end($path)) {
                    $textPart .= $olLiCount[$olLevel] . '. ';
                    $olLiCount[$olLevel]++;
                }
            } elseif ('</li>' === strtolower($textPart)) {
                $textPart = "\n";
            }

            $text .= $textPart;
        }

        return $text;
    }

    /**
     * Converts HTML blockquotes to their text form.
     *
     * Text inside blockquotes is prefixed by as many ">" characters as is the nesting depth.
     *
     * @param string $text Input HTML
     * @return string
     */
    private static function blockquoteToText($text)
    {
        // Split the input into blockquote opening tags, closing tags and the text between them
        if (preg_match_all('~(?:<blockquote[^>]*>\s*)|(?:\s*</blockquote>)|(?:.+?(?=</?blockquote)|(?:.+))~is', $text, $matches) > 0) {
            $text = '';
            $offset = 0;
            foreach ($matches[0] as $textPart) {
                $lowered = strtolower($textPart);
                if (($currentOffset = substr_count($lowered, '<blockquote')) > 0) {
                    $offset += $currentOffset;
                    // Only the outermost quote starts on a new line
                    $textPart = (1 === $offset) ? "\n" : '';
                } elseif (($currentOffset = substr_count($lowered, '</blockquote>')) > 0) {
                    $offset -= $currentOffset;
                    $textPart = '';
                } elseif ($offset > 0) {
                    // Quoted text: prefix every line, and finish with an empty quoted line
                    $prefix = str_repeat('>', $offset);
                    $textPart = "\n" . $prefix . ' '
                        . str_replace("\n", "\n" . $prefix . ' ', trim($textPart))
                        . "\n" . $prefix;
                }

                $text .= $textPart;
            }
        }

        return $text;
    }

    /**
     * Tells whether the given HTML fragment is an opening tag with exactly the given name.
     *
     * The name has to be followed by whitespace, "/" or ">", so "li" does not match
     * <link> and "b" does not match <br>.
     *
     * @param string $part HTML fragment starting with the tag
     * @param string $name Tag name
     * @return boolean
     */
    private static function isOpeningTag($part, $name)
    {
        return 1 === preg_match('~^<' . preg_quote($name, '~') . '(?=[\s/>])~i', $part);
    }
}
688: