1 | <?php |
---|
2 | |
---|
3 | /** |
---|
4 | * Copyright (C) 2008-2011 FluxBB |
---|
5 | * based on code by Rickard Andersson copyright (C) 2002-2008 PunBB |
---|
6 | * License: http://www.gnu.org/licenses/gpl.html GPL version 2 or higher |
---|
7 | */ |
---|
8 | |
---|
9 | // The contents of this file are very much inspired by the file functions_search.php |
---|
10 | // from the phpBB Group forum software phpBB2 (http://www.phpbb.com) |
---|
11 | |
---|
12 | |
---|
// Refuse to run when this file is requested on its own: unless the PUN
// constant already exists (presumably defined by the script that includes
// this file), stop immediately.
if (!defined('PUN'))
{
	exit;
}
---|
16 | |
---|
17 | |
---|
// Character class that matches one CJK or Hangul character.
// The \x{...} escapes are code points, so the class has to be embedded in a
// pattern that carries the "u" modifier (is_cjk() does exactly that).
define('PUN_CJK_HANGUL_REGEX', '['.implode('', array(
	// Hangul
	'\x{1100}-\x{11FF}', // Hangul Jamo
	'\x{3130}-\x{318F}', // Hangul Compatibility Jamo
	'\x{AC00}-\x{D7AF}', // Hangul Syllables

	// Japanese kana
	'\x{3040}-\x{309F}', // Hiragana
	'\x{30A0}-\x{30FF}', // Katakana
	'\x{31F0}-\x{31FF}', // Katakana Phonetic Extensions

	// CJK ideographs and the blocks that accompany them
	'\x{2E80}-\x{2EFF}', // CJK Radicals Supplement
	'\x{2F00}-\x{2FDF}', // Kangxi Radicals
	'\x{2FF0}-\x{2FFF}', // Ideographic Description Characters
	'\x{3000}-\x{303F}', // CJK Symbols and Punctuation
	'\x{31C0}-\x{31EF}', // CJK Strokes
	'\x{3200}-\x{32FF}', // Enclosed CJK Letters and Months
	'\x{3400}-\x{4DBF}', // CJK Unified Ideographs Extension A
	'\x{4E00}-\x{9FFF}', // CJK Unified Ideographs
	'\x{20000}-\x{2A6DF}', // CJK Unified Ideographs Extension B
)).']');
---|
42 | |
---|
43 | |
---|
//
// Reduces a text to the set of distinct words worth indexing or searching.
//
// $text  The text to split.
// $idx   Passed through to validate_search_word(); when false, the
//        characters % and * are also left in place.
//
// Returns the accepted words. Keys are the positions the words had after
// splitting, so they are not necessarily consecutive.
//
// NOTE(review): older documentation said the result depends on the current
// locale; nothing in this function reads or sets the locale itself.
//
function split_words($text, $idx)
{
	// Every recognised BBCode tag (opening or closing, with or without an attribute) becomes a space
	$text = preg_replace('%\[/?(b|u|s|ins|del|em|i|h|colou?r|quote|code|img|url|email|list)(?:\=[^\]]*)?\]%', ' ', $text);

	// Drop apostrophes and dashes that touch a non-letter/non-digit on either side.
	// The text is padded with a space on both ends so the lookarounds also work at the
	// very start and end; substr() removes the padding again.
	$padded = ' '.$text.' ';
	$padded = ucp_preg_replace('%((?<=[^\p{L}\p{N}])[\'\-]|[\'\-](?=[^\p{L}\p{N}]))%u', '', $padded);
	$text = substr($padded, 1, -1);

	// Turn each run of characters that are neither letters nor digits into one space.
	// A run may not start with an apostrophe or dash (nor % or * when not indexing).
	$kept_chars = $idx ? '' : '\%\*';
	$text = ucp_preg_replace('%(?![\'\-'.$kept_chars.'])[^\p{L}\p{N}]+%u', ' ', $text);

	// Collapse any remaining run of whitespace to its last whitespace character
	$text = preg_replace('%(\s){2,}%u', '\1', $text);

	// Split on single spaces and keep the first occurrence of each word
	$candidates = array_unique(explode(' ', $text));

	// Keep only the words that validate_search_word() accepts, under their original keys
	$accepted = array();
	foreach ($candidates as $position => $candidate)
	{
		if (validate_search_word($candidate, $idx))
			$accepted[$position] = $candidate;
	}

	return $accepted;
}
---|
75 | |
---|
76 | |
---|
//
// Checks if a word is a valid searchable word
//
// $word  The word to check.
// $idx   True when the word is about to be indexed, false when it comes
//        from a search query.
//
// Returns true when the word may be used, false otherwise.
//
function validate_search_word($word, $idx)
{
	// Filled on the first call by the included cache file and reused afterwards
	static $stopwords;

	// If the word is a keyword we don't want to index it, but we do want to be allowed to search it
	if (is_keyword($word))
		return !$idx;

	if (!isset($stopwords))
	{
		// The cache file is expected to assign $stopwords and to define
		// PUN_STOPWORDS_LOADED (its contents are not visible from this file)
		if (file_exists(FORUM_CACHE_DIR.'cache_stopwords.php'))
			include FORUM_CACHE_DIR.'cache_stopwords.php';

		// No usable cache yet: regenerate it and load the fresh copy
		if (!defined('PUN_STOPWORDS_LOADED'))
		{
			if (!defined('FORUM_CACHE_FUNCTIONS_LOADED'))
				require PUN_ROOT.'include/cache.php';

			generate_stopwords_cache();
			require FORUM_CACHE_DIR.'cache_stopwords.php';
		}
	}

	// If it is a stopword it isn't valid.
	// The comparison is strict on purpose: a loose in_array() compares numeric
	// strings by value, so e.g. the word '0100' would match the stopword '100'.
	if (in_array($word, $stopwords, true))
		return false;

	// If the word is CJK we don't want to index it, but we do want to be allowed to search it
	if (is_cjk($word))
		return !$idx;

	// Check the word is within the min/max length
	$num_chars = pun_strlen($word);
	return $num_chars >= PUN_SEARCH_MIN_WORD && $num_chars <= PUN_SEARCH_MAX_WORD;
}
---|
115 | |
---|
116 | |
---|
//
// Check a given word is a search keyword.
//
// Returns true only for the exact strings 'and', 'or' and 'not'. The
// comparison is strict, so non-string values such as true or 0 are never
// reported as keywords.
//
function is_keyword($word)
{
	return in_array($word, array('and', 'or', 'not'), true);
}
---|
124 | |
---|
125 | |
---|
//
// Tells whether a word is made up of CJK/Hangul characters only.
//
// Returns true when every character of $word is in PUN_CJK_HANGUL_REGEX and
// there is at least one character; false otherwise, including when the match
// itself fails.
//
function is_cjk($word)
{
	$pattern = '%^'.PUN_CJK_HANGUL_REGEX.'+$%u';

	return preg_match($pattern, $word) === 1;
}
---|
133 | |
---|
134 | |
---|
//
// Unwraps [img], [url] and [email] tags before indexing.
//
// The tags themselves are removed; the address and any description or link
// text they carried stay in the message as plain text. All other BBCode is
// left untouched.
//
function strip_bbcode($text)
{
	// Built on the first call only
	static $search = null, $replace = null;

	if ($search === null)
	{
		$search = array(
			// [img=description]address[/img]
			'%\[img=([^\]]*+)\]([^[]*+)\[/img\]%',
			// [url=address]text[/url] and [email=address]text[/email]; the text may contain other tags
			'%\[(url|email)=([^\]]*+)\]([^[]*+(?:(?!\[/\1\])\[[^[]*+)*)\[/\1\]%',
			// [img]address[/img], [url]address[/url] and [email]address[/email]
			'%\[(img|url|email)\]([^[]*+(?:(?!\[/\1\])\[[^[]*+)*)\[/\1\]%',
		);

		$replace = array(
			'$2 $1', // address, then description
			'$2 $3', // address, then text
			'$2', // address only
		);
	}

	// The patterns are applied one after the other, in the order listed
	return preg_replace($search, $replace, $text);
}
---|
153 | |
---|
154 | |
---|
//
// Brings the search index in line with the text of one post.
//
// $mode     'edit' compares the new words with the matches already stored
//           for the post and only adds/removes the difference; any other
//           value indexes every word as new.
// $post_id  Id of the post; it is placed into the SQL as given.
// $message  Post body.
// $subject  Topic subject, or null/empty when there is none to index.
//
function update_search_index($mode, $post_id, $message, $subject = null)
{
	global $db_type, $db;

	$prefix = $db->prefix;

	$message = utf8_strtolower($message);
	$subject = utf8_strtolower($subject);

	// Unwrap img/url/email tags, then split both texts into indexable words
	$fresh_words = array(
		'post' => split_words(strip_bbcode($message), true),
		'subject' => $subject ? split_words($subject, true) : array(),
	);

	// Words to link to / unlink from the post, and the word => id pairs already stored for it
	$to_add = $fresh_words;
	$to_remove = array('post' => array(), 'subject' => array());
	$indexed = array('post' => array(), 'subject' => array());

	if ($mode == 'edit')
	{
		$result = $db->query('SELECT w.id, w.word, m.subject_match FROM '.$prefix.'search_words AS w INNER JOIN '.$prefix.'search_matches AS m ON w.id=m.word_id WHERE m.post_id='.$post_id, true) or error('Unable to fetch search index words', __FILE__, __LINE__, $db->error());

		while ($row = $db->fetch_row($result))
		{
			if ($row[2])
				$indexed['subject'][$row[1]] = $row[0];
			else
				$indexed['post'][$row[1]] = $row[0];
		}

		$db->free_result($result);

		// Only the difference between stored and new words needs touching
		foreach ($fresh_words as $where => $fresh)
		{
			$stored = array_keys($indexed[$where]);

			$to_add[$where] = array_diff($fresh, $stored);
			$to_remove[$where] = array_diff($stored, $fresh);
		}
	}

	unset($fresh_words);

	// Make sure every word that is about to be linked has a row in search_words
	$candidates = array_unique(array_merge($to_add['post'], $to_add['subject']));

	if (!empty($candidates))
	{
		$quoted = implode('\',\'', array_map(array($db, 'escape'), $candidates));
		$result = $db->query('SELECT id, word FROM '.$prefix.'search_words WHERE word IN(\''.$quoted.'\')', true) or error('Unable to fetch search index words', __FILE__, __LINE__, $db->error());

		$known = array();
		while ($row = $db->fetch_row($result))
			$known[] = $row[1];

		$db->free_result($result);

		$missing = array_diff($candidates, $known);

		if (!empty($missing))
		{
			// The result of these INSERTs is not checked
			if (in_array($db_type, array('mysql', 'mysqli', 'mysql_innodb', 'mysqli_innodb')))
			{
				// One multi-row INSERT for the MySQL drivers
				$values = implode('\'),(\'', array_map(array($db, 'escape'), $missing));
				$db->query('INSERT INTO '.$prefix.'search_words (word) VALUES(\''.$values.'\')');
			}
			else
			{
				// One INSERT per word for every other driver
				foreach ($missing as $word)
					$db->query('INSERT INTO '.$prefix.'search_words (word) VALUES(\''.$db->escape($word).'\')');
			}
		}
	}

	// Remove matches for words that are gone (the lists are only non-empty when editing)
	foreach ($to_remove as $where => $gone)
	{
		if (empty($gone))
			continue;

		$ids = array();
		foreach ($gone as $word)
			$ids[] = $indexed[$where][$word];

		$subject_match = ($where == 'subject') ? 1 : 0;

		$db->query('DELETE FROM '.$prefix.'search_matches WHERE word_id IN('.implode(',', $ids).') AND post_id='.$post_id.' AND subject_match='.$subject_match) or error('Unable to delete search index word matches', __FILE__, __LINE__, $db->error());
	}

	// Link the post to every new word by looking the ids up in search_words
	foreach ($to_add as $where => $fresh)
	{
		if (empty($fresh))
			continue;

		$subject_match = ($where == 'subject') ? 1 : 0;
		$quoted = implode('\',\'', array_map(array($db, 'escape'), $fresh));

		$db->query('INSERT INTO '.$prefix.'search_matches (post_id, word_id, subject_match) SELECT '.$post_id.', id, '.$subject_match.' FROM '.$prefix.'search_words WHERE word IN(\''.$quoted.'\')') or error('Unable to insert search index word matches', __FILE__, __LINE__, $db->error());
	}
}
---|
267 | |
---|
268 | |
---|
//
// Removes the given posts from the search index.
//
// $post_ids  Comma-separated list of post ids. It is placed inside IN(...)
//            exactly as given, so it has to be non-empty and already safe
//            for SQL (presumably ensured by the callers).
//
// Words used by these posts are deleted from search_words only when they
// have exactly one row in search_matches overall; after that every match
// row of the posts is deleted.
//
function strip_search_index($post_ids)
{
	global $db_type, $db;

	$prefix = $db->prefix;

	if (in_array($db_type, array('mysql', 'mysqli', 'mysql_innodb', 'mysqli_innodb')))
	{
		// MySQL drivers: the same selection as the nested query below, done in separate steps

		// Step 1: every word the posts are linked to
		$result = $db->query('SELECT word_id FROM '.$prefix.'search_matches WHERE post_id IN('.$post_ids.') GROUP BY word_id') or error('Unable to fetch search index word match', __FILE__, __LINE__, $db->error());

		if ($db->num_rows($result))
		{
			$used_ids = array();
			while ($row = $db->fetch_row($result))
				$used_ids[] = $row[0];

			// Step 2: of those, the words with a single match row
			$result = $db->query('SELECT word_id FROM '.$prefix.'search_matches WHERE word_id IN('.implode(',', $used_ids).') GROUP BY word_id HAVING COUNT(word_id)=1') or error('Unable to fetch search index word match', __FILE__, __LINE__, $db->error());

			if ($db->num_rows($result))
			{
				$single_use_ids = array();
				while ($row = $db->fetch_row($result))
					$single_use_ids[] = $row[0];

				// Step 3: delete those words
				$db->query('DELETE FROM '.$prefix.'search_words WHERE id IN('.implode(',', $single_use_ids).')') or error('Unable to delete search index word', __FILE__, __LINE__, $db->error());
			}
		}
	}
	else
	{
		// Other drivers: one DELETE with nested subqueries
		$db->query('DELETE FROM '.$prefix.'search_words WHERE id IN(SELECT word_id FROM '.$prefix.'search_matches WHERE word_id IN(SELECT word_id FROM '.$prefix.'search_matches WHERE post_id IN('.$post_ids.') GROUP BY word_id) GROUP BY word_id HAVING COUNT(word_id)=1)') or error('Unable to delete from search index', __FILE__, __LINE__, $db->error());
	}

	// Finally drop every match row that belongs to the posts
	$db->query('DELETE FROM '.$prefix.'search_matches WHERE post_id IN('.$post_ids.')') or error('Unable to delete search index word match', __FILE__, __LINE__, $db->error());
}
---|