Changeset 3 for branches/rsr.v5.1.dev/web/punbb/include/search_idx.php
- Timestamp:
- Nov 14, 2011, 11:17:15 PM (13 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
branches/rsr.v5.1.dev/web/punbb/include/search_idx.php
r1 r3 1 1 <?php 2 /*********************************************************************** 3 4 Copyright (C) 2002-2005 Rickard Andersson (rickard@punbb.org) 5 6 This file is part of PunBB. 7 8 PunBB is free software; you can redistribute it and/or modify it 9 under the terms of the GNU General Public License as published 10 by the Free Software Foundation; either version 2 of the License, 11 or (at your option) any later version. 12 13 PunBB is distributed in the hope that it will be useful, but 14 WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; if not, write to the Free Software 20 Foundation, Inc., 59 Temple Place, Suite 330, Boston, 21 MA 02111-1307 USA 22 23 ************************************************************************/ 24 2 3 /** 4 * Copyright (C) 2008-2011 FluxBB 5 * based on code by Rickard Andersson copyright (C) 2002-2008 PunBB 6 * License: http://www.gnu.org/licenses/gpl.html GPL version 2 or higher 7 */ 25 8 26 9 // The contents of this file are very much inspired by the file functions_search.php 27 // from the phpBB Group forum software phpBB2 (http://www.phpbb.com) .10 // from the phpBB Group forum software phpBB2 (http://www.phpbb.com) 28 11 29 12 … … 33 16 34 17 18 // Make a regex that will match CJK or Hangul characters 19 define('PUN_CJK_HANGUL_REGEX', '['. 20 '\x{1100}-\x{11FF}'. // Hangul Jamo 1100-11FF (http://www.fileformat.info/info/unicode/block/hangul_jamo/index.htm) 21 '\x{3130}-\x{318F}'. // Hangul Compatibility Jamo 3130-318F (http://www.fileformat.info/info/unicode/block/hangul_compatibility_jamo/index.htm) 22 '\x{AC00}-\x{D7AF}'. // Hangul Syllables AC00-D7AF (http://www.fileformat.info/info/unicode/block/hangul_syllables/index.htm) 23 24 // Hiragana 25 '\x{3040}-\x{309F}'. // Hiragana 3040-309F (http://www.fileformat.info/info/unicode/block/hiragana/index.htm) 26 27 // Katakana 28 '\x{30A0}-\x{30FF}'. // Katakana 30A0-30FF (http://www.fileformat.info/info/unicode/block/katakana/index.htm) 29 '\x{31F0}-\x{31FF}'. // Katakana Phonetic Extensions 31F0-31FF (http://www.fileformat.info/info/unicode/block/katakana_phonetic_extensions/index.htm) 30 31 // CJK Unified Ideographs (http://en.wikipedia.org/wiki/CJK_Unified_Ideographs) 32 '\x{2E80}-\x{2EFF}'. // CJK Radicals Supplement 2E80-2EFF (http://www.fileformat.info/info/unicode/block/cjk_radicals_supplement/index.htm) 33 '\x{2F00}-\x{2FDF}'. // Kangxi Radicals 2F00-2FDF (http://www.fileformat.info/info/unicode/block/kangxi_radicals/index.htm) 34 '\x{2FF0}-\x{2FFF}'. // Ideographic Description Characters 2FF0-2FFF (http://www.fileformat.info/info/unicode/block/ideographic_description_characters/index.htm) 35 '\x{3000}-\x{303F}'. // CJK Symbols and Punctuation 3000-303F (http://www.fileformat.info/info/unicode/block/cjk_symbols_and_punctuation/index.htm) 36 '\x{31C0}-\x{31EF}'. // CJK Strokes 31C0-31EF (http://www.fileformat.info/info/unicode/block/cjk_strokes/index.htm) 37 '\x{3200}-\x{32FF}'. // Enclosed CJK Letters and Months 3200-32FF (http://www.fileformat.info/info/unicode/block/enclosed_cjk_letters_and_months/index.htm) 38 '\x{3400}-\x{4DBF}'. // CJK Unified Ideographs Extension A 3400-4DBF (http://www.fileformat.info/info/unicode/block/cjk_unified_ideographs_extension_a/index.htm) 39 '\x{4E00}-\x{9FFF}'. // CJK Unified Ideographs 4E00-9FFF (http://www.fileformat.info/info/unicode/block/cjk_unified_ideographs/index.htm) 40 '\x{20000}-\x{2A6DF}'. // CJK Unified Ideographs Extension B 20000-2A6DF (http://www.fileformat.info/info/unicode/block/cjk_unified_ideographs_extension_b/index.htm) 41 ']'); 42 43 35 44 // 36 45 // "Cleans up" a text string and returns an array of unique words 37 46 // This function depends on the current locale setting 38 47 // 39 function split_words($text) 40 { 41 global $pun_user; 42 static $noise_match, $noise_replace, $stopwords; 43 44 if (empty($noise_match)) 45 { 46 $noise_match = array('[quote', '[code', '[url', '[img', '[email', '[color', '[colour', 'quote]', 'code]', 'url]', 'img]', 'email]', 'color]', 'colour]', '^', '$', '&', '(', ')', '<', '>', '`', '\'', '"', '|', ',', '@', '_', '?', '%', '~', '+', '[', ']', '{', '}', ':', '\\', '/', '=', '#', ';', '!', '*'); 47 $noise_replace = array('', '', '', '', '', '', '', '', '', '', '', '', '', '', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '', ' ', ' ', ' ', ' ', '', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '' , ' ', ' ', ' ', ' ', ' ', ' '); 48 49 $stopwords = (array)@file(PUN_ROOT.'lang/'.$pun_user['language'].'/stopwords.txt'); 50 $stopwords = array_map('trim', $stopwords); 51 } 52 53 // Clean up 54 $patterns[] = '#&[\#a-z0-9]+?;#i'; 55 $patterns[] = '#\b[\w]+:\/\/[a-z0-9\.\-]+(\/[a-z0-9\?\.%_\-\+=&\/~]+)?#'; 56 $patterns[] = '#\[\/?[a-z\*=\+\-]+(\:?[0-9a-z]+)?:[a-z0-9]{10,}(\:[a-z0-9]+)?=?.*?\]#'; 57 $text = preg_replace($patterns, ' ', ' '.strtolower($text).' '); 58 59 // Filter out junk 60 $text = str_replace($noise_match, $noise_replace, $text); 61 62 // Strip out extra whitespace between words 63 $text = trim(preg_replace('#\s+#', ' ', $text)); 48 function split_words($text, $idx) 49 { 50 // Remove BBCode 51 $text = preg_replace('%\[/?(b|u|s|ins|del|em|i|h|colou?r|quote|code|img|url|email|list)(?:\=[^\]]*)?\]%', ' ', $text); 52 53 // Remove any apostrophes or dashes which aren't part of words 54 $text = substr(ucp_preg_replace('%((?<=[^\p{L}\p{N}])[\'\-]|[\'\-](?=[^\p{L}\p{N}]))%u', '', ' '.$text.' '), 1, -1); 55 56 // Remove punctuation and symbols (actually anything that isn't a letter or number), allow apostrophes and dashes (and % * if we aren't indexing) 57 $text = ucp_preg_replace('%(?![\'\-'.($idx ? '' : '\%\*').'])[^\p{L}\p{N}]+%u', ' ', $text); 58 59 // Replace multiple whitespace or dashes 60 $text = preg_replace('%(\s){2,}%u', '\1', $text); 64 61 65 62 // Fill an array with all the words 66 $words = explode(' ', $text); 67 68 if (!empty($words)) 69 { 70 while (list($i, $word) = @each($words)) 71 { 72 $words[$i] = trim($word, '.'); 73 $num_chars = pun_strlen($word); 74 75 if ($num_chars < 3 || $num_chars > 20 || in_array($word, $stopwords)) 76 unset($words[$i]); 77 } 78 } 79 80 return array_unique($words); 63 $words = array_unique(explode(' ', $text)); 64 65 // Remove any words that should not be indexed 66 foreach ($words as $key => $value) 67 { 68 // If the word shouldn't be indexed, remove it 69 if (!validate_search_word($value, $idx)) 70 unset($words[$key]); 71 } 72 73 return $words; 74 } 75 76 77 // 78 // Checks if a word is a valid searchable word 79 // 80 function validate_search_word($word, $idx) 81 { 82 static $stopwords; 83 84 // If the word is a keyword we don't want to index it, but we do want to be allowed to search it 85 if (is_keyword($word)) 86 return !$idx; 87 88 if (!isset($stopwords)) 89 { 90 if (file_exists(FORUM_CACHE_DIR.'cache_stopwords.php')) 91 include FORUM_CACHE_DIR.'cache_stopwords.php'; 92 93 if (!defined('PUN_STOPWORDS_LOADED')) 94 { 95 if (!defined('FORUM_CACHE_FUNCTIONS_LOADED')) 96 require PUN_ROOT.'include/cache.php'; 97 98 generate_stopwords_cache(); 99 require FORUM_CACHE_DIR.'cache_stopwords.php'; 100 } 101 } 102 103 // If it is a stopword it isn't valid 104 if (in_array($word, $stopwords)) 105 return false; 106 107 // If the word if CJK we don't want to index it, but we do want to be allowed to search it 108 if (is_cjk($word)) 109 return !$idx; 110 111 // Check the word is within the min/max length 112 $num_chars = pun_strlen($word); 113 return $num_chars >= PUN_SEARCH_MIN_WORD && $num_chars <= PUN_SEARCH_MAX_WORD; 114 } 115 116 117 // 118 // Check a given word is a search keyword. 119 // 120 function is_keyword($word) 121 { 122 return $word == 'and' || $word == 'or' || $word == 'not'; 123 } 124 125 126 // 127 // Check if a given word is CJK or Hangul. 128 // 129 function is_cjk($word) 130 { 131 return preg_match('%^'.PUN_CJK_HANGUL_REGEX.'+$%u', $word) ? true : false; 132 } 133 134 135 // 136 // Strip [img] [url] and [email] out of the message so we don't index their contents 137 // 138 function strip_bbcode($text) 139 { 140 static $patterns; 141 142 if (!isset($patterns)) 143 { 144 $patterns = array( 145 '%\[img=([^\]]*+)\]([^[]*+)\[/img\]%' => '$2 $1', // Keep the url and description 146 '%\[(url|email)=([^\]]*+)\]([^[]*+(?:(?!\[/\1\])\[[^[]*+)*)\[/\1\]%' => '$2 $3', // Keep the url and text 147 '%\[(img|url|email)\]([^[]*+(?:(?!\[/\1\])\[[^[]*+)*)\[/\1\]%' => '$2', // Keep the url 148 ); 149 } 150 151 return preg_replace(array_keys($patterns), array_values($patterns), $text); 81 152 } 82 153 … … 89 160 global $db_type, $db; 90 161 162 $message = utf8_strtolower($message); 163 $subject = utf8_strtolower($subject); 164 165 // Remove any bbcode that we shouldn't index 166 $message = strip_bbcode($message); 167 91 168 // Split old and new post/subject to obtain array of 'words' 92 $words_message = split_words($message );93 $words_subject = ($subject) ? split_words($subject ) : array();169 $words_message = split_words($message, true); 170 $words_subject = ($subject) ? split_words($subject, true) : array(); 94 171 95 172 if ($mode == 'edit') 96 173 { 97 $result = $db->query('SELECT w.id, w.word, m.subject_match FROM '.$db->prefix.'search_words AS w INNER JOIN '.$db->prefix.'search_matches AS m ON w.id=m.word_id WHERE m.post_id='.$post_id, true) or error(' Impossible de retrouver les mots index de recherches', __FILE__, __LINE__, $db->error());174 $result = $db->query('SELECT w.id, w.word, m.subject_match FROM '.$db->prefix.'search_words AS w INNER JOIN '.$db->prefix.'search_matches AS m ON w.id=m.word_id WHERE m.post_id='.$post_id, true) or error('Unable to fetch search index words', __FILE__, __LINE__, $db->error()); 98 175 99 176 // Declare here to stop array_keys() and array_diff() from complaining if not set … … 130 207 if (!empty($unique_words)) 131 208 { 132 $result = $db->query('SELECT id, word FROM '.$db->prefix.'search_words WHERE word IN( '.implode(',', preg_replace('#^(.*)$#', '\'\1\'', $unique_words)).')', true) or error('Impossible de retrouver les mots index de recherches', __FILE__, __LINE__, $db->error());209 $result = $db->query('SELECT id, word FROM '.$db->prefix.'search_words WHERE word IN(\''.implode('\',\'', array_map(array($db, 'escape'), $unique_words)).'\')', true) or error('Unable to fetch search index words', __FILE__, __LINE__, $db->error()); 133 210 134 211 $word_ids = array(); … … 147 224 case 'mysql': 148 225 case 'mysqli': 149 $db->query('INSERT INTO '.$db->prefix.'search_words (word) VALUES'.implode(',', preg_replace('#^(.*)$#', '(\'\1\')', $new_words))) or error('Impossible d\'ajouter les mots index de recherche', __FILE__, __LINE__, $db->error()); 226 case 'mysql_innodb': 227 case 'mysqli_innodb': 228 $db->query('INSERT INTO '.$db->prefix.'search_words (word) VALUES(\''.implode('\'),(\'', array_map(array($db, 'escape'), $new_words)).'\')'); 150 229 break; 151 230 152 231 default: 153 while (list(, $word) = @each($new_words))154 $db->query('INSERT INTO '.$db->prefix.'search_words (word) VALUES(\''.$ word.'\')') or error('Impossible d\'ajouter les mots index de recherche', __FILE__, __LINE__, $db->error());232 foreach ($new_words as $word) 233 $db->query('INSERT INTO '.$db->prefix.'search_words (word) VALUES(\''.$db->escape($word).'\')'); 155 234 break; 156 235 } … … 161 240 162 241 // Delete matches (only if editing a post) 163 while (list($match_in, $wordlist) = @each($words['del']))242 foreach ($words['del'] as $match_in => $wordlist) 164 243 { 165 244 $subject_match = ($match_in == 'subject') ? 1 : 0; … … 168 247 { 169 248 $sql = ''; 170 while (list(, $word) = @each($wordlist))249 foreach ($wordlist as $word) 171 250 $sql .= (($sql != '') ? ',' : '').$cur_words[$match_in][$word]; 172 251 173 $db->query('DELETE FROM '.$db->prefix.'search_matches WHERE word_id IN('.$sql.') AND post_id='.$post_id.' AND subject_match='.$subject_match) or error(' Impossible de supprimer des mots index de recherche', __FILE__, __LINE__, $db->error());252 $db->query('DELETE FROM '.$db->prefix.'search_matches WHERE word_id IN('.$sql.') AND post_id='.$post_id.' AND subject_match='.$subject_match) or error('Unable to delete search index word matches', __FILE__, __LINE__, $db->error()); 174 253 } 175 254 } 176 255 177 256 // Add new matches 178 while (list($match_in, $wordlist) = @each($words['add']))257 foreach ($words['add'] as $match_in => $wordlist) 179 258 { 180 259 $subject_match = ($match_in == 'subject') ? 1 : 0; 181 260 182 261 if (!empty($wordlist)) 183 $db->query('INSERT INTO '.$db->prefix.'search_matches (post_id, word_id, subject_match) SELECT '.$post_id.', id, '.$subject_match.' FROM '.$db->prefix.'search_words WHERE word IN( '.implode(',', preg_replace('#^(.*)$#', '\'\1\'', $wordlist)).')') or error('Impossible d\'ajouter les correspondances index de recherche', __FILE__, __LINE__, $db->error());262 $db->query('INSERT INTO '.$db->prefix.'search_matches (post_id, word_id, subject_match) SELECT '.$post_id.', id, '.$subject_match.' FROM '.$db->prefix.'search_words WHERE word IN(\''.implode('\',\'', array_map(array($db, 'escape'), $wordlist)).'\')') or error('Unable to insert search index word matches', __FILE__, __LINE__, $db->error()); 184 263 } 185 264 … … 199 278 case 'mysql': 200 279 case 'mysqli': 201 { 202 $result = $db->query('SELECT word_id FROM '.$db->prefix.'search_matches WHERE post_id IN('.$post_ids.') GROUP BY word_id') or error('Impossible de retrouver les correspondances de mots index de recherche', __FILE__, __LINE__, $db->error()); 280 case 'mysql_innodb': 281 case 'mysqli_innodb': 282 { 283 $result = $db->query('SELECT word_id FROM '.$db->prefix.'search_matches WHERE post_id IN('.$post_ids.') GROUP BY word_id') or error('Unable to fetch search index word match', __FILE__, __LINE__, $db->error()); 203 284 204 285 if ($db->num_rows($result)) … … 208 289 $word_ids .= ($word_ids != '') ? ','.$row[0] : $row[0]; 209 290 210 $result = $db->query('SELECT word_id FROM '.$db->prefix.'search_matches WHERE word_id IN('.$word_ids.') GROUP BY word_id HAVING COUNT(word_id)=1') or error(' Impossible de retrouver les correspondances de mots index de recherche', __FILE__, __LINE__, $db->error());291 $result = $db->query('SELECT word_id FROM '.$db->prefix.'search_matches WHERE word_id IN('.$word_ids.') GROUP BY word_id HAVING COUNT(word_id)=1') or error('Unable to fetch search index word match', __FILE__, __LINE__, $db->error()); 211 292 212 293 if ($db->num_rows($result)) … … 216 297 $word_ids .= ($word_ids != '') ? ','.$row[0] : $row[0]; 217 298 218 $db->query('DELETE FROM '.$db->prefix.'search_words WHERE id IN('.$word_ids.')') or error(' Impossible de supprimer des mots index de recherche', __FILE__, __LINE__, $db->error());299 $db->query('DELETE FROM '.$db->prefix.'search_words WHERE id IN('.$word_ids.')') or error('Unable to delete search index word', __FILE__, __LINE__, $db->error()); 219 300 } 220 301 } … … 224 305 225 306 default: 226 $db->query('DELETE FROM '.$db->prefix.'search_words WHERE id IN(SELECT word_id FROM '.$db->prefix.'search_matches WHERE word_id IN(SELECT word_id FROM '.$db->prefix.'search_matches WHERE post_id IN('.$post_ids.') GROUP BY word_id) GROUP BY word_id HAVING COUNT(word_id)=1)') or error(' Impossible de supprimer depuis l\'index de recherche', __FILE__, __LINE__, $db->error());307 $db->query('DELETE FROM '.$db->prefix.'search_words WHERE id IN(SELECT word_id FROM '.$db->prefix.'search_matches WHERE word_id IN(SELECT word_id FROM '.$db->prefix.'search_matches WHERE post_id IN('.$post_ids.') GROUP BY word_id) GROUP BY word_id HAVING COUNT(word_id)=1)') or error('Unable to delete from search index', __FILE__, __LINE__, $db->error()); 227 308 break; 228 309 } 229 310 230 $db->query('DELETE FROM '.$db->prefix.'search_matches WHERE post_id IN('.$post_ids.')') or error(' Impossible de supprimer des correspondances de mots index de recherche', __FILE__, __LINE__, $db->error());231 } 311 $db->query('DELETE FROM '.$db->prefix.'search_matches WHERE post_id IN('.$post_ids.')') or error('Unable to delete search index word match', __FILE__, __LINE__, $db->error()); 312 }
Note: See TracChangeset
for help on using the changeset viewer.