1: <?php
2:
3: /**
4: * Jyxo PHP Library
5: *
6: * LICENSE
7: *
8: * This source file is subject to the new BSD license that is bundled
9: * with this package in the file license.txt.
10: * It is also available through the world-wide-web at this URL:
11: * https://github.com/jyxo/php/blob/master/license.txt
12: */
13:
14: namespace Jyxo;
15:
16: /**
* Simple spam filter. A text is considered to be spam if it contains a blacklisted word, contains
* too many links, or consists mostly of repeating words (words occurring at least three times make up
* more than 3/4 of all words).
19: *
20: * @category Jyxo
21: * @package Jyxo\SpamFilter
22: * @copyright Copyright (c) 2005-2011 Jyxo, s.r.o.
23: * @license https://github.com/jyxo/php/blob/master/license.txt
24: * @author Roman Řáha
25: */
class SpamFilter
{
    /**
     * Number of links at which a text of any length is considered to be spam.
     *
     * @var integer
     */
    const LINK_MAX_COUNT = 10;

    /**
     * Number of links at which a short text (see LINK_WORDS_MIN_COUNT) is considered to be spam.
     *
     * @var integer
     */
    const LINK_SHORT_MAX_COUNT = 3;

    /**
     * Ratio of links number to the total words number at which a long text is considered to be spam.
     *
     * @var float
     */
    const LINK_MAX_RATIO = 0.05;

    /**
     * Words count up to which (inclusive) a text is treated as short. The links ratio is computed
     * only for texts with more words than this.
     *
     * @var integer
     */
    const LINK_WORDS_MIN_COUNT = 30;

    /**
     * Words blacklist.
     *
     * @var array
     */
    private $blackList = array();

    /**
     * Ignored words (words are array keys).
     *
     * @var array
     */
    private $ignoreWords = array();

    /**
     * Checks if the given text is spam.
     *
     * The checks run in this order and stop at the first positive one: blacklist, links count,
     * repeating words.
     *
     * @param string $text Checked text
     * @return boolean
     */
    public function isSpam($text)
    {
        return $this->isBlack($text)
            || $this->isLinkSpam($text)
            || $this->isBabble($text);
    }

    /**
     * Returns if the given text contains blacklisted words.
     *
     * The search is a case-sensitive substring match. Empty blacklist entries are skipped, because
     * an empty needle would either match every text (PHP 8) or trigger a warning (older versions).
     *
     * @param string $text Checked text
     * @return boolean
     */
    public function isBlack($text)
    {
        foreach ($this->blackList as $black) {
            $needle = (string) $black;
            if ('' === $needle) {
                // Nothing to search for
                continue;
            }
            if (false !== strpos($text, $needle)) {
                // There is a blacklisted word in the text
                return true;
            }
        }
        return false;
    }

    /**
     * Returns if the text contains too many links.
     *
     * A link is anything that looks like a host name with an optional ftp/http/https scheme,
     * so bare domains (and other dotted tokens ending with 2-6 letters) count as well.
     *
     * @param string $text Checked text
     * @return boolean
     */
    public function isLinkSpam($text)
    {
        $urlPattern = '~((ftp|http|https)://)?[-\w]+(\.[-\w]+)*\.[a-z]{2,6}~i';
        // preg_match_all() returns false on failure; treat that as no links found
        $linkCount = (int) preg_match_all($urlPattern, $text, $matches);
        if (self::LINK_MAX_COUNT <= $linkCount) {
            // More links than allowed
            return true;
        }

        // Words count is estimated as the number of whitespace runs in the trimmed text plus one
        $wordCount = (int) preg_match_all('~[\pZ\s]+~u', trim($text), $matches) + 1;
        if (self::LINK_WORDS_MIN_COUNT >= $wordCount) {
            // For short texts use links count check
            return self::LINK_SHORT_MAX_COUNT <= $linkCount;
        }

        // For long texts check links ratio
        return self::LINK_MAX_RATIO <= ($linkCount / $wordCount);
    }

    /**
     * Returns if the text consists of repeating parts.
     * Returns true if the number of at least three times repeated words is greater than
     * 3/4 of all words.
     *
     * Words are compared case-insensitively. Ignored words and empty tokens (caused by leading or
     * trailing whitespace) are not counted at all.
     *
     * @param string $text Checked text
     * @return boolean
     */
    public function isBabble($text)
    {
        $parts = preg_split('~[\pZ\s]+~u', $text, -1, PREG_SPLIT_NO_EMPTY);
        if (false === $parts) {
            // The Unicode-aware split fails on malformed UTF-8; fall back to a byte-wise whitespace split
            $parts = preg_split('~\s+~', $text, -1, PREG_SPLIT_NO_EMPTY);
            if (false === $parts) {
                $parts = array();
            }
        }

        // Walk through the text and count word appearances
        $occurrences = array();
        $numberOfWords = 0;
        foreach ($parts as $part) {
            $word = mb_strtolower(trim($part), 'utf-8');
            // Skip tokens that are empty after trimming and words that are supposed to be ignored
            if ('' === $word || isset($this->ignoreWords[$word])) {
                continue;
            }
            $occurrences[$word] = isset($occurrences[$word]) ? $occurrences[$word] + 1 : 1;
            $numberOfWords++;
        }

        // Sum up appearances of words repeated more than two times
        $repeated = 0;
        foreach ($occurrences as $appearances) {
            if ($appearances > 2) {
                $repeated += $appearances;
            }
        }

        // If the number of repeated words is greater than 3/4 of all words, the text is considered to be spam
        return $repeated > ($numberOfWords * 3 / 4);
    }

    /**
     * Sets word blacklist.
     *
     * @param array $blackList Words blacklist
     * @return \Jyxo\SpamFilter
     */
    public function setBlackList(array $blackList)
    {
        $this->blackList = $blackList;
        return $this;
    }

    /**
     * Sets ignored word list.
     *
     * @param array $ignoreWords Ignored words list (words are array keys)
     * @return \Jyxo\SpamFilter
     */
    public function setIgnoreWords(array $ignoreWords)
    {
        $this->ignoreWords = $ignoreWords;
        return $this;
    }
}
195: