Overview

Namespaces

  • Jyxo
    • Beholder
      • TestCase
    • Gettext
      • Parser
    • Input
      • Chain
      • Filter
      • Validator
    • Mail
      • Email
        • Attachment
      • Parser
      • Sender
    • Rpc
      • Json
      • Xml
    • Shell
    • Spl
    • Svn
    • Time
    • Webdav
  • PHP

Classes

  • Charset
  • Color
  • Css
  • ErrorHandler
  • ErrorMail
  • FirePhp
  • Html
  • HtmlTag
  • SpamFilter
  • String
  • Timer
  • XmlReader

Exceptions

  • Exception
  • Overview
  • Namespace
  • Class
  • Tree
  • Deprecated
  1: <?php
  2: 
  3: /**
  4:  * Jyxo PHP Library
  5:  *
  6:  * LICENSE
  7:  *
  8:  * This source file is subject to the new BSD license that is bundled
  9:  * with this package in the file license.txt.
 10:  * It is also available through the world-wide-web at this URL:
 11:  * https://github.com/jyxo/php/blob/master/license.txt
 12:  */
 13: 
 14: namespace Jyxo;
 15: 
 16: /**
 17:  * Walks through the given text and computes individual words counts. If more than 3/4 words repeat
 18:  * the text is considered to be spam.
 19:  *
 20:  * @category Jyxo
 21:  * @package Jyxo\SpamFilter
 22:  * @copyright Copyright (c) 2005-2011 Jyxo, s.r.o.
 23:  * @license https://github.com/jyxo/php/blob/master/license.txt
 24:  * @author Roman Řáha
 25:  */
 26: class SpamFilter
 27: {
 28:     /**
 29:      * Maximal number of links in the text.
 30:      *
 31:      * @var integer
 32:      */
 33:     const LINK_MAX_COUNT = 10;
 34: 
 35:     /**
 36:      * Maximal number of links in a short text.
 37:      *
 38:      * @var integer
 39:      */
 40:     const LINK_SHORT_MAX_COUNT = 3;
 41: 
 42:     /**
 43:      * Ratio of links number to the total words number in the text.
 44:      *
 45:      * @var float
 46:      */
 47:     const LINK_MAX_RATIO = 0.05;
 48: 
 49:     /**
 50:      * Minimal words count where the links ratio is computed.
 51:      *
 52:      * @var integer
 53:      */
 54:     const LINK_WORDS_MIN_COUNT = 30;
 55: 
 56:     /**
 57:      * Words blacklist.
 58:      *
 59:      * @var array
 60:      */
 61:     private $blackList = array();
 62: 
 63:     /**
 64:      * Ignored words (words are array keys).
 65:      *
 66:      * @var array
 67:      */
 68:     private $ignoreWords = array();
 69: 
 70:     /**
 71:      * Checks if the given text is spam.
 72:      *
 73:      * @param string $text Checked text
 74:      * @return boolean
 75:      */
 76:     public function isSpam($text)
 77:     {
 78:         // Blacklisting first
 79:         if ($this->isBlack($text)) {
 80:             return true;
 81:         }
 82:         // Link count check
 83:         if ($this->isLinkSpam($text)) {
 84:             return true;
 85:         }
 86:         // Words repeat check
 87:         if ($this->isBabble($text)) {
 88:             return true;
 89:         }
 90:         return false;
 91:     }
 92: 
 93:     /**
 94:      * Returns if the given text contains blacklisted words.
 95:      *
 96:      * @param string $text Checked text
 97:      * @return boolean
 98:      */
 99:     public function isBlack($text)
100:     {
101:         foreach ($this->blackList as $black) {
102:             if (false !== strpos($text, $black)) {
103:                 // There is a blacklisted word in the text
104:                 return true;
105:             }
106:         }
107:         return false;
108:     }
109: 
110:     /**
111:      * Returns if the text contains too many links.
112:      *
113:      * @param string $text Checked text
114:      * @return boolean
115:      */
116:     public function isLinkSpam($text)
117:     {
118:         $urlPattern = '~((ftp|http|https)://)?[-\w]+(\.[-\w]+)*\.[a-z]{2,6}~i';
119:         $linkCount = preg_match_all($urlPattern, $text, $matches);
120:         if (self::LINK_MAX_COUNT <= $linkCount) {
121:             // More links than allowed
122:             return true;
123:         }
124:         $wordCount = preg_match_all('~[\pZ\s]+~u', trim($text), $matches) + 1;
125:         if (self::LINK_WORDS_MIN_COUNT >= $wordCount) {
126:             // For short texts use links count check
127:             return self::LINK_SHORT_MAX_COUNT <= $linkCount;
128:         }
129:         // For long texts check links ratio
130:         return self::LINK_MAX_RATIO <= ($linkCount / $wordCount);
131:     }
132: 
133:     /**
134:      * Returns if the text consists of repeating parts.
135:      * Returns true if the number of at least three times repeated words is greater than
136:      * 3/4 of all words.
137:      *
138:      * @param string $text Checked text
139:      * @return boolean
140:      */
141:     public function isBabble($text)
142:     {
143:         $words = array();
144:         $numberOfWords = 0;
145:         // Walk through the text a count word appearances
146:         foreach (preg_split('~[\pZ\s]+~u', $text) as $word) {
147:             $word = mb_strtolower(trim($word), 'utf-8');
148:             // Check if the word is supposed to be ignored
149:             if (!isset($this->ignoreWords[$word])) {
150:                 if (isset($words[$word])) {
151:                     $words[$word]++;
152:                 } else {
153:                     $words[$word] = 1;
154:                 }
155:                 $numberOfWords++;
156:             }
157:         }
158: 
159:         // Count words repeated more than two times
160:         $count = 0;
161:         foreach ($words as $value) {
162:             if ($value > 2) {
163:                 $count += $value;
164:             }
165:         }
166: 
167:         // If the number of repeated words is greater than 3/4 of all words, the text is considered to be spam
168:         return $count > ($numberOfWords * 3 / 4);
169:     }
170: 
171:     /**
172:      * Sets word blacklist.
173:      *
174:      * @param array $blackList Words blacklist
175:      * @return \Jyxo\SpamFilter
176:      */
177:     public function setBlackList(array $blackList)
178:     {
179:         $this->blackList = $blackList;
180:         return $this;
181:     }
182: 
183:     /**
184:      * Sets ignored word list.
185:      *
186:      * @param array $ignoreWords Ignored words list
187:      * @return \Jyxo\SpamFilter
188:      */
189:     public function setIgnoreWords(array $ignoreWords)
190:     {
191:         $this->ignoreWords = $ignoreWords;
192:         return $this;
193:     }
194: }
195: 
Jyxo PHP Library API documentation generated by ApiGen 2.3.0