Overview

Packages

  • Jyxo_Beholder
  • Jyxo_Charset
  • Jyxo_Color
  • Jyxo_Css
  • Jyxo_ErrorHandling
  • Jyxo_FirePhp
  • Jyxo_Gettext
    • Parser
  • Jyxo_Html
  • Jyxo_Input
    • Chain
    • Filter
    • Validator
  • Jyxo_Mail
    • Email
    • Parser
    • Sender
  • Jyxo_Rpc
    • Json
    • Xml
  • Jyxo_Shell
  • Jyxo_SpamFilter
  • Jyxo_Spl
  • Jyxo_String
  • Jyxo_Svn
  • Jyxo_Time
  • Jyxo_Timer
  • Jyxo_Webdav
  • Jyxo_XmlReader
  • PHP

Classes

  • Jyxo_SpamFilter
  • Overview
  • Package
  • Class
  • Tree
  • Deprecated
  1: <?php
  2: 
  3: /**
  4:  * Jyxo PHP Library
  5:  *
  6:  * LICENSE
  7:  *
  8:  * This source file is subject to the new BSD license that is bundled
  9:  * with this package in the file license.txt.
 10:  * It is also available through the world-wide-web at this URL:
 11:  * https://github.com/jyxo/php/blob/master/license.txt
 12:  */
 13: 
 14: /**
 15:  * Walks through the given text and computes individual words counts. If more than 3/4 words repeat
 16:  * the text is considered to be spam.
 17:  *
 18:  * @category Jyxo
 19:  * @package Jyxo_SpamFilter
 20:  * @copyright Copyright (c) 2005-2011 Jyxo, s.r.o.
 21:  * @license https://github.com/jyxo/php/blob/master/license.txt
 22:  * @author Roman Řáha
 23:  */
 24: class Jyxo_SpamFilter
 25: {
 26:     /**
 27:      * Maximal number of links in the text.
 28:      *
 29:      * @var integer
 30:      */
 31:     const LINK_MAX_COUNT = 10;
 32: 
 33:     /**
 34:      * Maximal number of links in a short text.
 35:      *
 36:      * @var integer
 37:      */
 38:     const LINK_SHORT_MAX_COUNT = 3;
 39: 
 40:     /**
 41:      * Ratio of links number to the total words number in the text.
 42:      *
 43:      * @var float
 44:      */
 45:     const LINK_MAX_RATIO = 0.05;
 46: 
 47:     /**
 48:      * Minimal words count where the links ratio is computed.
 49:      *
 50:      * @var integer
 51:      */
 52:     const LINK_WORDS_MIN_COUNT = 30;
 53: 
 54:     /**
 55:      * Words blacklist.
 56:      *
 57:      * @var array
 58:      */
 59:     private $blackList = array();
 60: 
 61:     /**
 62:      * Ignored words (words are array keys).
 63:      *
 64:      * @var array
 65:      */
 66:     private $ignoreWords = array();
 67: 
 68:     /**
 69:      * Checks if the given text is spam.
 70:      *
 71:      * @param string $text Checked text
 72:      * @return boolean
 73:      */
 74:     public function isSpam($text)
 75:     {
 76:         // Blacklisting first
 77:         if ($this->isBlack($text)) {
 78:             return true;
 79:         }
 80:         // Link count check
 81:         if ($this->isLinkSpam($text)) {
 82:             return true;
 83:         }
 84:         // Words repeat check
 85:         if ($this->isBabble($text)) {
 86:             return true;
 87:         }
 88:         return false;
 89:     }
 90: 
 91:     /**
 92:      * Returns if the given text contains blacklisted words.
 93:      *
 94:      * @param string $text Checked text
 95:      * @return boolean
 96:      */
 97:     public function isBlack($text)
 98:     {
 99:         foreach ($this->blackList as $black) {
100:             if (false !== strpos($text, $black)) {
101:                 // There is a blacklisted word in the text
102:                 return true;
103:             }
104:         }
105:         return false;
106:     }
107: 
108:     /**
109:      * Returns if the text contains too many links.
110:      *
111:      * @param string $text Checked text
112:      * @return boolean
113:      */
114:     public function isLinkSpam($text)
115:     {
116:         $urlPattern = '~((ftp|http|https)://)?[-\w]+(\.[-\w]+)*\.[a-z]{2,6}~i';
117:         $linkCount = preg_match_all($urlPattern, $text, $matches);
118:         if (self::LINK_MAX_COUNT <= $linkCount) {
119:             // More links than allowed
120:             return true;
121:         }
122:         $wordCount = preg_match_all('~[\pZ\s]+~u', trim($text), $matches) + 1;
123:         if (self::LINK_WORDS_MIN_COUNT >= $wordCount) {
124:             // For short texts use links count check
125:             return self::LINK_SHORT_MAX_COUNT <= $linkCount;
126:         }
127:         // For long texts check links ratio
128:         return self::LINK_MAX_RATIO <= ($linkCount / $wordCount);
129:     }
130: 
131:     /**
132:      * Returns if the text consists of repeating parts.
133:      * Returns true if the number of at least three times repeated words is greater than
134:      * 3/4 of all words.
135:      *
136:      * @param string $text Checked text
137:      * @return boolean
138:      */
139:     public function isBabble($text)
140:     {
141:         $words = array();
142:         $numberOfWords = 0;
143:         // Walk through the text a count word appearances
144:         foreach (preg_split('~[\pZ\s]+~u', $text) as $word) {
145:             $word = mb_strtolower(trim($word), 'utf-8');
146:             // Check if the word is supposed to be ignored
147:             if (!isset($this->ignoreWords[$word])) {
148:                 if (isset($words[$word])) {
149:                     $words[$word]++;
150:                 } else {
151:                     $words[$word] = 1;
152:                 }
153:                 $numberOfWords++;
154:             }
155:         }
156: 
157:         // Count words repeated more than two times
158:         $count = 0;
159:         foreach ($words as $value) {
160:             if ($value > 2) {
161:                 $count += $value;
162:             }
163:         }
164: 
165:         // If the number of repeated words is greater than 3/4 of all words, the text is considered to be spam
166:         return $count > ($numberOfWords * 3 / 4);
167:     }
168: 
169:     /**
170:      * Sets word blacklist.
171:      *
172:      * @param array $blackList Words blacklist
173:      * @return Jyxo_SpamFilter
174:      */
175:     public function setBlackList(array $blackList)
176:     {
177:         $this->blackList = $blackList;
178:         return $this;
179:     }
180: 
181:     /**
182:      * Sets ignored word list.
183:      *
184:      * @param array $ignoreWords Ignored words list
185:      * @return Jyxo_SpamFilter
186:      */
187:     public function setIgnoreWords(array $ignoreWords)
188:     {
189:         $this->ignoreWords = $ignoreWords;
190:         return $this;
191:     }
192: }
193: 
Jyxo PHP Library API documentation generated by ApiGen 2.3.0