1: <?php
2:
3: /**
4: * Jyxo PHP Library
5: *
6: * LICENSE
7: *
8: * This source file is subject to the new BSD license that is bundled
9: * with this package in the file license.txt.
10: * It is also available through the world-wide-web at this URL:
11: * https://github.com/jyxo/php/blob/master/license.txt
12: */
13:
14: /**
 * Heuristic spam filter. A text is considered to be spam if it contains a blacklisted word,
 * if it contains too many links, or if more than 3/4 of its words are words repeated at least three times.
17: *
18: * @category Jyxo
19: * @package Jyxo_SpamFilter
20: * @copyright Copyright (c) 2005-2011 Jyxo, s.r.o.
21: * @license https://github.com/jyxo/php/blob/master/license.txt
22: * @author Roman Řáha
23: */
class Jyxo_SpamFilter
{
    /**
     * Number of links at which any text is considered to be spam.
     * The threshold is inclusive: a text with exactly this many links is already spam.
     *
     * @var integer
     */
    const LINK_MAX_COUNT = 10;

    /**
     * Number of links at which a short text is considered to be spam.
     * The threshold is inclusive: a short text with exactly this many links is already spam.
     *
     * @var integer
     */
    const LINK_SHORT_MAX_COUNT = 3;

    /**
     * Ratio of links number to the total words number at which a long text is considered to be spam.
     * The threshold is inclusive.
     *
     * @var float
     */
    const LINK_MAX_RATIO = 0.05;

    /**
     * Words count up to which (inclusive) a text is treated as short. Short texts are judged
     * by the absolute links count, longer texts by the links ratio.
     *
     * @var integer
     */
    const LINK_WORDS_MIN_COUNT = 30;

    /**
     * Words blacklist.
     *
     * @var array
     */
    private $blackList = array();

    /**
     * Ignored words (words are array keys). Keys are compared with lower-cased words.
     *
     * @var array
     */
    private $ignoreWords = array();

    /**
     * Checks if the given text is spam.
     *
     * Runs the blacklist check, the link count check and the words repeat check in this order
     * and stops at the first one that reports spam.
     *
     * @param string $text Checked text
     * @return boolean
     */
    public function isSpam($text)
    {
        return $this->isBlack($text)
            || $this->isLinkSpam($text)
            || $this->isBabble($text);
    }

    /**
     * Returns if the given text contains blacklisted words.
     *
     * The search is a case-sensitive substring search. Empty blacklist entries are skipped.
     *
     * @param string $text Checked text
     * @return boolean
     */
    public function isBlack($text)
    {
        foreach ($this->blackList as $black) {
            $black = (string) $black;
            if ('' === $black) {
                // An empty needle matches at offset 0 since PHP 8 (and raises a warning before),
                // which would mark every text as blacklisted
                continue;
            }
            if (false !== strpos($text, $black)) {
                // There is a blacklisted word in the text
                return true;
            }
        }
        return false;
    }

    /**
     * Returns if the text contains too many links.
     *
     * A link is any token matching the URL pattern below; the scheme is optional, so bare
     * dotted names ending with 2-6 ASCII letters (e.g. "example.com") are counted as well.
     *
     * @param string $text Checked text
     * @return boolean
     */
    public function isLinkSpam($text)
    {
        $urlPattern = '~((ftp|http|https)://)?[-\w]+(\.[-\w]+)*\.[a-z]{2,6}~i';
        // preg_match_all() returns false on failure; treat it as no match
        $linkCount = (int) preg_match_all($urlPattern, $text, $matches);
        if (self::LINK_MAX_COUNT <= $linkCount) {
            // The links limit has been reached
            return true;
        }

        // Words count is the number of whitespace runs inside the trimmed text plus one
        $wordCount = (int) preg_match_all('~[\pZ\s]+~u', trim($text), $matches) + 1;
        if (self::LINK_WORDS_MIN_COUNT >= $wordCount) {
            // For short texts use links count check
            return self::LINK_SHORT_MAX_COUNT <= $linkCount;
        }

        // For long texts check links ratio
        return self::LINK_MAX_RATIO <= ($linkCount / $wordCount);
    }

    /**
     * Returns if the text consists of repeating parts.
     * Returns true if the number of at least three times repeated words is greater than
     * 3/4 of all words. Words are compared case-insensitively; ignored words are not counted at all.
     *
     * @param string $text Checked text
     * @return boolean
     */
    public function isBabble($text)
    {
        // Empty tokens (caused by leading or trailing whitespace) must not be counted as words,
        // otherwise padding the text with whitespace would lower the repeat ratio
        $tokens = preg_split('~[\pZ\s]+~u', $text, -1, PREG_SPLIT_NO_EMPTY);
        if (false === $tokens) {
            // The text could not be split (e.g. it is not valid UTF-8), there are no words to judge
            return false;
        }

        $words = array();
        $numberOfWords = 0;
        // Walk through the text and count word appearances
        foreach ($tokens as $word) {
            $word = mb_strtolower(trim($word), 'utf-8');
            // Check if the word is empty or supposed to be ignored
            if (('' === $word) || isset($this->ignoreWords[$word])) {
                continue;
            }

            if (isset($words[$word])) {
                $words[$word]++;
            } else {
                $words[$word] = 1;
            }
            $numberOfWords++;
        }

        // Count words repeated more than two times
        $count = 0;
        foreach ($words as $value) {
            if ($value > 2) {
                $count += $value;
            }
        }

        // If the number of repeated words is greater than 3/4 of all words, the text is considered to be spam
        return $count > ($numberOfWords * 3 / 4);
    }

    /**
     * Sets word blacklist.
     *
     * @param array $blackList Words blacklist
     * @return Jyxo_SpamFilter
     */
    public function setBlackList(array $blackList)
    {
        $this->blackList = $blackList;
        return $this;
    }

    /**
     * Sets ignored word list.
     *
     * The array is stored as given and looked up by key, so the ignored words have to be
     * the (lower-cased) array keys.
     *
     * @param array $ignoreWords Ignored words list
     * @return Jyxo_SpamFilter
     */
    public function setIgnoreWords(array $ignoreWords)
    {
        $this->ignoreWords = $ignoreWords;
        return $this;
    }
}
193: