[6] | 1 | <?php |
---|
| 2 | |
---|
| 3 | /** |
---|
| 4 | * Copyright (C) 2008-2011 FluxBB |
---|
| 5 | * based on code by Rickard Andersson copyright (C) 2002-2008 PunBB |
---|
| 6 | * License: http://www.gnu.org/licenses/gpl.html GPL version 2 or higher |
---|
| 7 | */ |
---|
| 8 | |
---|
| 9 | // The contents of this file are very much inspired by the file functions_search.php |
---|
| 10 | // from the phpBB Group forum software phpBB2 (http://www.phpbb.com) |
---|
| 11 | |
---|
| 12 | |
---|
// Abort immediately unless the PUN constant is defined, so that this
// script cannot be run "directly"
if (!defined('PUN'))
{
    exit;
}
---|
| 16 | |
---|
| 17 | |
---|
// Character class that matches one CJK or Hangul character.
// The \x{...} escapes use more than two hex digits, so the class is meant for
// patterns compiled with the "u" modifier (this is how is_cjk() uses it).
define('PUN_CJK_HANGUL_REGEX', '['.implode('', array(
    // Hangul
    '\x{1100}-\x{11FF}',    // Hangul Jamo
    '\x{3130}-\x{318F}',    // Hangul Compatibility Jamo
    '\x{AC00}-\x{D7AF}',    // Hangul Syllables

    // Hiragana
    '\x{3040}-\x{309F}',    // Hiragana

    // Katakana
    '\x{30A0}-\x{30FF}',    // Katakana
    '\x{31F0}-\x{31FF}',    // Katakana Phonetic Extensions

    // CJK ideographs, radicals, strokes and symbols
    '\x{2E80}-\x{2EFF}',    // CJK Radicals Supplement
    '\x{2F00}-\x{2FDF}',    // Kangxi Radicals
    '\x{2FF0}-\x{2FFF}',    // Ideographic Description Characters
    '\x{3000}-\x{303F}',    // CJK Symbols and Punctuation
    '\x{31C0}-\x{31EF}',    // CJK Strokes
    '\x{3200}-\x{32FF}',    // Enclosed CJK Letters and Months
    '\x{3400}-\x{4DBF}',    // CJK Unified Ideographs Extension A
    '\x{4E00}-\x{9FFF}',    // CJK Unified Ideographs
    '\x{20000}-\x{2A6DF}',  // CJK Unified Ideographs Extension B
)).']');
---|
| 42 | |
---|
| 43 | |
---|
//
// "Cleans up" a text string and returns an array of unique words
//
// $text  the text to split
// $idx   true when the words are going to be indexed; when false the
//        characters % and * are left in place as well
//
// The returned array keeps the keys assigned by explode(), so they are not
// necessarily contiguous.
// NOTE(review): an earlier comment said this depends on the current locale;
// nothing visible here reads the locale - confirm inside ucp_preg_replace().
//
function split_words($text, $idx)
{
    // Replace every recognised BBCode tag (opening or closing) with a space
    $bbcode_tags = '%\[/?(b|u|s|ins|del|em|i|h|colou?r|quote|code|img|url|email|list)(?:\=[^\]]*)?\]%';
    $text = preg_replace($bbcode_tags, ' ', $text);

    // Drop apostrophes and dashes that are not inside a word. The text is
    // padded with a space on each side so the look-arounds also work at the
    // very start and end; the padding is cut off again afterwards.
    $loose_marks = '%((?<=[^\p{L}\p{N}])[\'\-]|[\'\-](?=[^\p{L}\p{N}]))%u';
    $padded = ucp_preg_replace($loose_marks, '', ' '.$text.' ');
    $text = substr($padded, 1, -1);

    // Turn each run of characters that are neither letters nor numbers into a
    // single space. A run may not start on an apostrophe or dash (nor on % or *
    // when we are not indexing).
    if ($idx)
        $also_allowed = '';
    else
        $also_allowed = '\%\*';

    $text = ucp_preg_replace('%(?![\'\-'.$also_allowed.'])[^\p{L}\p{N}]+%u', ' ', $text);

    // Collapse repeated whitespace
    $text = preg_replace('%(\s){2,}%u', '\1', $text);

    // One entry per distinct word
    $candidates = array_unique(explode(' ', $text));

    // Keep only the words that pass validation, under their original keys
    $accepted = array();
    foreach ($candidates as $position => $candidate)
    {
        if (validate_search_word($candidate, $idx))
            $accepted[$position] = $candidate;
    }

    return $accepted;
}
---|
| 75 | |
---|
| 76 | |
---|
//
// Checks if a word is a valid searchable word
//
// $word  the word to check
// $idx   true when the word is about to be indexed, false when it comes
//        from a search query
//
// Returns true if the word may be used, false otherwise. Keywords and CJK
// words are accepted for searching only; stopwords are never accepted; every
// other word must be between PUN_SEARCH_MIN_WORD and PUN_SEARCH_MAX_WORD
// characters long (inclusive).
//
function validate_search_word($word, $idx)
{
    // Loaded on the first call that reaches the stopword check and then reused
    static $stopwords;

    // If the word is a keyword we don't want to index it, but we do want to be allowed to search it
    if (is_keyword($word))
        return !$idx;

    if (!isset($stopwords))
    {
        // The cache file is included in this function's scope.
        // NOTE(review): presumably it assigns $stopwords and defines
        // PUN_STOPWORDS_LOADED - confirm against generate_stopwords_cache().
        if (file_exists(FORUM_CACHE_DIR.'cache_stopwords.php'))
            include FORUM_CACHE_DIR.'cache_stopwords.php';

        // No usable cache yet: build it and load the fresh file
        if (!defined('PUN_STOPWORDS_LOADED'))
        {
            if (!defined('FORUM_CACHE_FUNCTIONS_LOADED'))
                require PUN_ROOT.'include/cache.php';

            generate_stopwords_cache();
            require FORUM_CACHE_DIR.'cache_stopwords.php';
        }
    }

    // If it is a stopword it isn't valid. The comparison is strict so that a
    // numeric-looking word such as "1e3" cannot match a different stopword
    // such as "1000" (assumes the cached list holds strings - TODO confirm).
    if (in_array($word, $stopwords, true))
        return false;

    // If the word is CJK we don't want to index it, but we do want to be allowed to search it
    if (is_cjk($word))
        return !$idx;

    // Check the word is within the min/max length
    $num_chars = pun_strlen($word);
    return $num_chars >= PUN_SEARCH_MIN_WORD && $num_chars <= PUN_SEARCH_MAX_WORD;
}
---|
| 115 | |
---|
| 116 | |
---|
//
// Check a given word is a search keyword.
//
// Returns true only when $word is exactly the string 'and', 'or' or 'not'.
// The check is strict, so non-string values such as true or 0 are never
// treated as keywords.
//
function is_keyword($word)
{
    return in_array($word, array('and', 'or', 'not'), true);
}
---|
| 124 | |
---|
| 125 | |
---|
//
// Check if a given word is CJK or Hangul.
//
// Returns true when every character of $word falls inside
// PUN_CJK_HANGUL_REGEX, false otherwise (including for the empty string
// and when the match itself fails).
//
function is_cjk($word)
{
    $pattern = '%^'.PUN_CJK_HANGUL_REGEX.'+$%u';

    return preg_match($pattern, $word) === 1;
}
---|
| 133 | |
---|
| 134 | |
---|
//
// Unwrap [img], [url] and [email] tags, leaving their url/address and any
// description or link text behind as plain text
//
// Returns $text with the tags replaced.
//
function strip_bbcode($text)
{
    // Built once, on the first call. $search[n] pairs with $replace[n].
    static $search, $replace;

    if (!isset($search))
    {
        $search = array(
            '%\[img=([^\]]*+)\]([^[]*+)\[/img\]%',
            '%\[(url|email)=([^\]]*+)\]([^[]*+(?:(?!\[/\1\])\[[^[]*+)*)\[/\1\]%',
            '%\[(img|url|email)\]([^[]*+(?:(?!\[/\1\])\[[^[]*+)*)\[/\1\]%',
        );

        $replace = array(
            '$2 $1', // [img=description]url[/img]: keep the url and description
            '$2 $3', // [url=...]text[/url], [email=...]text[/email]: keep the url and text
            '$2', // [img]url[/img], [url]url[/url], [email]address[/email]: keep the url
        );
    }

    return preg_replace($search, $replace, $text);
}
---|
| 153 | |
---|
| 154 | |
---|
//
// Updates the search index with the contents of $post_id (and $subject)
//
// $mode     'edit' compares the new words with those already indexed for the
//           post and removes the ones that are gone; any other value only adds
// $post_id  id of the post; cast to an integer before it is placed in SQL
// $message  the post body
// $subject  the topic subject, or null when there is none to index
//
// Returns nothing. Failed queries that are checked end up in error().
//
function update_search_index($mode, $post_id, $message, $subject = null)
{
    global $db_type, $db;

    // The id is concatenated straight into the queries below
    $post_id = intval($post_id);

    $message = utf8_strtolower($message);

    // Without a subject there is nothing to lowercase; avoid handing null on
    $subject = ($subject === null) ? '' : utf8_strtolower($subject);

    // Remove any bbcode that we shouldn't index
    $message = strip_bbcode($message);

    // Split the post/subject to obtain arrays of 'words'
    $words_message = split_words($message, true);
    $words_subject = ($subject) ? split_words($subject, true) : array();

    // word => word id of what is currently indexed for this post.
    // Only filled in when editing; declared up front so it always exists.
    $cur_words = array('post' => array(), 'subject' => array());

    if ($mode == 'edit')
    {
        $result = $db->query('SELECT w.id, w.word, m.subject_match FROM '.$db->prefix.'search_words AS w INNER JOIN '.$db->prefix.'search_matches AS m ON w.id=m.word_id WHERE m.post_id='.$post_id, true) or error('Unable to fetch search index words', __FILE__, __LINE__, $db->error());

        while ($row = $db->fetch_row($result))
        {
            $match_in = ($row[2]) ? 'subject' : 'post';
            $cur_words[$match_in][$row[1]] = $row[0];
        }

        $db->free_result($result);

        // Add what is new, delete what is no longer present
        $words['add']['post'] = array_diff($words_message, array_keys($cur_words['post']));
        $words['add']['subject'] = array_diff($words_subject, array_keys($cur_words['subject']));
        $words['del']['post'] = array_diff(array_keys($cur_words['post']), $words_message);
        $words['del']['subject'] = array_diff(array_keys($cur_words['subject']), $words_subject);
    }
    else
    {
        $words['add']['post'] = $words_message;
        $words['add']['subject'] = $words_subject;
        $words['del']['post'] = array();
        $words['del']['subject'] = array();
    }

    unset($words_message);
    unset($words_subject);

    // Get unique words from the above arrays
    $unique_words = array_unique(array_merge($words['add']['post'], $words['add']['subject']));

    if (!empty($unique_words))
    {
        // Find out which of the words already have a row in search_words
        $result = $db->query('SELECT id, word FROM '.$db->prefix.'search_words WHERE word IN(\''.implode('\',\'', array_map(array($db, 'escape'), $unique_words)).'\')', true) or error('Unable to fetch search index words', __FILE__, __LINE__, $db->error());

        $word_ids = array();
        while ($row = $db->fetch_row($result))
            $word_ids[$row[1]] = $row[0];

        $db->free_result($result);

        $new_words = array_diff($unique_words, array_keys($word_ids));
        unset($unique_words);

        if (!empty($new_words))
        {
            // NOTE(review): unlike the other queries, these inserts are not
            // followed by "or error()". That looks deliberate (a word that
            // already exists would otherwise abort the request) - confirm
            // before adding a check.
            switch ($db_type)
            {
                case 'mysql':
                case 'mysqli':
                case 'mysql_innodb':
                case 'mysqli_innodb':
                    // One multi-row insert
                    $db->query('INSERT INTO '.$db->prefix.'search_words (word) VALUES(\''.implode('\'),(\'', array_map(array($db, 'escape'), $new_words)).'\')');
                    break;

                default:
                    // One insert per word
                    foreach ($new_words as $word)
                        $db->query('INSERT INTO '.$db->prefix.'search_words (word) VALUES(\''.$db->escape($word).'\')');
                    break;
            }
        }

        unset($new_words);
    }

    // Delete matches (only if editing a post)
    foreach ($words['del'] as $match_in => $wordlist)
    {
        if (empty($wordlist))
            continue;

        $subject_match = ($match_in == 'subject') ? 1 : 0;

        // Translate the words back to the ids fetched above
        $del_ids = array();
        foreach ($wordlist as $word)
            $del_ids[] = $cur_words[$match_in][$word];

        $db->query('DELETE FROM '.$db->prefix.'search_matches WHERE word_id IN('.implode(',', $del_ids).') AND post_id='.$post_id.' AND subject_match='.$subject_match) or error('Unable to delete search index word matches', __FILE__, __LINE__, $db->error());
    }

    // Add new matches
    foreach ($words['add'] as $match_in => $wordlist)
    {
        if (empty($wordlist))
            continue;

        $subject_match = ($match_in == 'subject') ? 1 : 0;

        $db->query('INSERT INTO '.$db->prefix.'search_matches (post_id, word_id, subject_match) SELECT '.$post_id.', id, '.$subject_match.' FROM '.$db->prefix.'search_words WHERE word IN(\''.implode('\',\'', array_map(array($db, 'escape'), $wordlist)).'\')') or error('Unable to insert search index word matches', __FILE__, __LINE__, $db->error());
    }
}
---|
| 267 | |
---|
| 268 | |
---|
//
// Strip search index of indexed words in $post_ids
//
// $post_ids  placed verbatim inside IN(...), so it must already be a safe,
//            comma-separated list of post ids (a single id works too)
//
// First deletes the rows in search_words for words that are matched by the
// given posts and have exactly one row in search_matches, then deletes every
// search_matches row belonging to the given posts.
//
function strip_search_index($post_ids)
{
    global $db_type, $db;

    switch ($db_type)
    {
        case 'mysql':
        case 'mysqli':
        case 'mysql_innodb':
        case 'mysqli_innodb':
        {
            // Same selection as the single statement in the default branch,
            // done in separate steps.
            // NOTE(review): presumably because MySQL cannot (or could not)
            // run the nested form - confirm.

            // Step 1: the words matched by the posts being stripped
            $result = $db->query('SELECT word_id FROM '.$db->prefix.'search_matches WHERE post_id IN('.$post_ids.') GROUP BY word_id') or error('Unable to fetch search index word match', __FILE__, __LINE__, $db->error());

            $word_ids = array();
            while ($row = $db->fetch_row($result))
                $word_ids[] = $row[0];

            $db->free_result($result);

            if (!empty($word_ids))
            {
                // Step 2: of those, the words with exactly one match row
                $result = $db->query('SELECT word_id FROM '.$db->prefix.'search_matches WHERE word_id IN('.implode(',', $word_ids).') GROUP BY word_id HAVING COUNT(word_id)=1') or error('Unable to fetch search index word match', __FILE__, __LINE__, $db->error());

                $single_ids = array();
                while ($row = $db->fetch_row($result))
                    $single_ids[] = $row[0];

                $db->free_result($result);

                // Step 3: remove those words
                if (!empty($single_ids))
                    $db->query('DELETE FROM '.$db->prefix.'search_words WHERE id IN('.implode(',', $single_ids).')') or error('Unable to delete search index word', __FILE__, __LINE__, $db->error());
            }

            break;
        }

        default:
            $db->query('DELETE FROM '.$db->prefix.'search_words WHERE id IN(SELECT word_id FROM '.$db->prefix.'search_matches WHERE word_id IN(SELECT word_id FROM '.$db->prefix.'search_matches WHERE post_id IN('.$post_ids.') GROUP BY word_id) GROUP BY word_id HAVING COUNT(word_id)=1)') or error('Unable to delete from search index', __FILE__, __LINE__, $db->error());
            break;
    }

    // Finally drop the match rows of the posts themselves
    $db->query('DELETE FROM '.$db->prefix.'search_matches WHERE post_id IN('.$post_ids.')') or error('Unable to delete search index word match', __FILE__, __LINE__, $db->error());
}
---|