diff -r de56132c008d -r bdac73ed481e includes/search.php --- a/includes/search.php Sun Mar 28 21:49:26 2010 -0400 +++ b/includes/search.php Sun Mar 28 23:10:46 2010 -0400 @@ -25,63 +25,63 @@ class Searcher { - var $results; - var $index; - var $warnings; - var $match_case = false; + var $results; + var $index; + var $warnings; + var $match_case = false; - function buildIndex($texts) - { - $this->index = Array(); - $stopwords = get_stopwords(); + function buildIndex($texts) + { + $this->index = Array(); + $stopwords = get_stopwords(); - foreach($texts as $i => $l) - { - $seed = md5(microtime(true) . mt_rand()); - $texts[$i] = str_replace("'", 'xxxApoS'.$seed.'xxx', $texts[$i]); - $texts[$i] = preg_replace('#([\W_]+)#i', ' ', $texts[$i]); - $texts[$i] = preg_replace('#([ ]+?)#', ' ', $texts[$i]); - $texts[$i] = preg_replace('#([\']*){2,}#s', '', $texts[$i]); - $texts[$i] = str_replace('xxxApoS'.$seed.'xxx', "'", $texts[$i]); - $l = $texts[$i]; - $words = Array(); - $good_chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789\' '; - $good_chars = enano_str_split($good_chars, 1); - $letters = enano_str_split($l, 1); - foreach($letters as $x => $t) - { - if(!in_array($t, $good_chars)) - unset($letters[$x]); - } - $letters = implode('', $letters); - $words = explode(' ', $letters); - foreach($words as $c => $w) - { - if(strlen($w) < 2 || in_array($w, $stopwords) || strlen($w) > 63 || preg_match('/[\']{2,}/', $w)) - unset($words[$c]); - else - $words[$c] = $w; - } - $words = array_values($words); - foreach($words as $c => $w) - { - if(isset($this->index[$w])) - { - if(!in_array($i, $this->index[$w])) - $this->index[$w][] = $i; - } - else - { - $this->index[$w] = Array(); - $this->index[$w][] = $i; - } - } - } - foreach($this->index as $k => $v) - { - $this->index[$k] = implode(',', $this->index[$k]); - } - } + foreach($texts as $i => $l) + { + $seed = md5(microtime(true) . mt_rand()); + $texts[$i] = str_replace("'", 'xxxApoS'.$seed.'xxx', $texts[$i]); + $texts[$i] = preg_replace('#([\W_]+)#i', ' ', $texts[$i]); + $texts[$i] = preg_replace('#([ ]+?)#', ' ', $texts[$i]); + $texts[$i] = preg_replace('#([\']*){2,}#s', '', $texts[$i]); + $texts[$i] = str_replace('xxxApoS'.$seed.'xxx', "'", $texts[$i]); + $l = $texts[$i]; + $words = Array(); + $good_chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789\' '; + $good_chars = enano_str_split($good_chars, 1); + $letters = enano_str_split($l, 1); + foreach($letters as $x => $t) + { + if(!in_array($t, $good_chars)) + unset($letters[$x]); + } + $letters = implode('', $letters); + $words = explode(' ', $letters); + foreach($words as $c => $w) + { + if(strlen($w) < 2 || in_array($w, $stopwords) || strlen($w) > 63 || preg_match('/[\']{2,}/', $w)) + unset($words[$c]); + else + $words[$c] = $w; + } + $words = array_values($words); + foreach($words as $c => $w) + { + if(isset($this->index[$w])) + { + if(!in_array($i, $this->index[$w])) + $this->index[$w][] = $i; + } + else + { + $this->index[$w] = Array(); + $this->index[$w][] = $i; + } + } + } + foreach($this->index as $k => $v) + { + $this->index[$k] = implode(',', $this->index[$k]); + } + } } /** @@ -100,486 +100,486 @@ function perform_search($query, &$warnings, $case_sensitive = false, &$word_list) { - global $db, $session, $paths, $template, $plugins; // Common objects - global $lang; - - $warnings = array(); - - // - // STAGE 0: PARSE SEARCH QUERY - // Identify all terms of the query. Separate between what is required and what is not, and what should be sent through the index as - // opposed to straight-out LIKE-selected. - // + global $db, $session, $paths, $template, $plugins; // Common objects + global $lang; + + $warnings = array(); + + // + // STAGE 0: PARSE SEARCH QUERY + // Identify all terms of the query. Separate between what is required and what is not, and what should be sent through the index as + // opposed to straight-out LIKE-selected. + // - $query = parse_search_query($query, $warnings); + $query = parse_search_query($query, $warnings); - // Segregate search terms containing spaces - $query_phrase = array( - 'any' => array(), - 'req' => array() - ); + // Segregate search terms containing spaces + $query_phrase = array( + 'any' => array(), + 'req' => array() + ); - foreach ( $query['any'] as $i => $_ ) - { - $term =& $query['any'][$i]; - $term = trim($term); - // the indexer only indexes words a-z with apostrophes - if ( preg_match('/[^A-Za-z\']/', $term) ) - { - $query_phrase['any'][] = $term; - unset($term, $query['any'][$i]); - } - } - unset($term); - $query['any'] = array_values($query['any']); + foreach ( $query['any'] as $i => $_ ) + { + $term =& $query['any'][$i]; + $term = trim($term); + // the indexer only indexes words a-z with apostrophes + if ( preg_match('/[^A-Za-z\']/', $term) ) + { + $query_phrase['any'][] = $term; + unset($term, $query['any'][$i]); + } + } + unset($term); + $query['any'] = array_values($query['any']); - foreach ( $query['req'] as $i => $_ ) - { - $term =& $query['req'][$i]; - $term = trim($term); - if ( preg_match('/[^A-Za-z\']/', $term) ) - { - $query_phrase['req'][] = $term; - unset($term, $query['req'][$i]); - } - } - unset($term); - $query['req'] = array_values($query['req']); + foreach ( $query['req'] as $i => $_ ) + { + $term =& $query['req'][$i]; + $term = trim($term); + if ( preg_match('/[^A-Za-z\']/', $term) ) + { + $query_phrase['req'][] = $term; + unset($term, $query['req'][$i]); + } + } + unset($term); + $query['req'] = array_values($query['req']); - $results = array(); - $scores = array(); - $ns_list = '(' . implode('|', array_keys($paths->nslist)) . ')'; + $results = array(); + $scores = array(); + $ns_list = '(' . implode('|', array_keys($paths->nslist)) . ')'; - // FIXME: Update to use FULLTEXT algo when available. + // FIXME: Update to use FULLTEXT algo when available. - // Build an SQL query to load from the index table - if ( count($query['any']) < 1 && count($query['req']) < 1 && count($query_phrase['any']) < 1 && count($query_phrase['req']) < 1 ) - { - // This is both because of technical restrictions and devastation that would occur on shared servers/large sites. - $warnings[] = $lang->get('search_err_query_no_positive'); - return array(); - } + // Build an SQL query to load from the index table + if ( count($query['any']) < 1 && count($query['req']) < 1 && count($query_phrase['any']) < 1 && count($query_phrase['req']) < 1 ) + { + // This is both because of technical restrictions and devastation that would occur on shared servers/large sites. + $warnings[] = $lang->get('search_err_query_no_positive'); + return array(); + } - // - // STAGE 1 - // Get all possible result pages from the search index. Tally which pages have the most words, and later sort them by boolean relevance - // + // + // STAGE 1 + // Get all possible result pages from the search index. Tally which pages have the most words, and later sort them by boolean relevance + // - // Skip this if no indexable words are included + // Skip this if no indexable words are included - if ( count($query['any']) > 0 || count($query['req']) > 0 ) - { - $where_any = array(); - foreach ( $query['any'] as $term ) - { - $term = escape_string_like($term); - if ( !$case_sensitive ) - $term = strtolower($term); - $where_any[] = $term; - } - foreach ( $query['req'] as $term ) - { - $term = escape_string_like($term); - if ( !$case_sensitive ) - $term = strtolower($term); - $where_any[] = $term; - } + if ( count($query['any']) > 0 || count($query['req']) > 0 ) + { + $where_any = array(); + foreach ( $query['any'] as $term ) + { + $term = escape_string_like($term); + if ( !$case_sensitive ) + $term = strtolower($term); + $where_any[] = $term; + } + foreach ( $query['req'] as $term ) + { + $term = escape_string_like($term); + if ( !$case_sensitive ) + $term = strtolower($term); + $where_any[] = $term; + } - $col_word = ( $case_sensitive ) ? 'word' : 'word_lcase'; - $where_any_str = ( count($where_any) > 0 ) ? '( ' . $col_word . ' LIKE \'%' . implode('%\' OR ' . $col_word . ' LIKE \'%', $where_any) . '%\' )' : ''; + $col_word = ( $case_sensitive ) ? 'word' : 'word_lcase'; + $where_any_str = ( count($where_any) > 0 ) ? '( ' . $col_word . ' LIKE \'%' . implode('%\' OR ' . $col_word . ' LIKE \'%', $where_any) . '%\' )' : ''; - // generate query - $sql = "SELECT word, page_names FROM " . table_prefix . "search_index WHERE {$where_any_str}"; - if ( !($q = $db->sql_query($sql)) ) - $db->_die('Error is in perform_search(), includes/search.php, query 1'); + // generate query + $sql = "SELECT word, page_names FROM " . table_prefix . "search_index WHERE {$where_any_str}"; + if ( !($q = $db->sql_query($sql)) ) + $db->_die('Error is in perform_search(), includes/search.php, query 1'); - $word_tracking = array(); - if ( $row = $db->fetchrow($q) ) - { - do - { - // get page list - $pages =& $row['page_names']; - - // Find page IDs that contain commas - // This should never happen because commas are escaped by sanitize_page_id(). Nevertheless for compatibility with older - // databases, and to alleviate the concerns of hackers, we'll accommodate for page IDs with commas here by checking for - // IDs that don't match the pattern for stringified page ID + namespace. If it doesn't match, that means it's a continuation - // of the previous ID and should be concatenated to the previous entry. - $matches = strpos($pages, ',') ? explode(',', $pages) : array($pages); - $prev = false; - foreach ( $matches as $i => $_ ) - { - $match =& $matches[$i]; - if ( !preg_match("/^ns=$ns_list;pid=(.+)$/", $match) && $prev ) - { - $matches[$prev] .= ',' . $match; - unset($match, $matches[$i]); - continue; - } - $prev = $i; - } - unset($match); + $word_tracking = array(); + if ( $row = $db->fetchrow($q) ) + { + do + { + // get page list + $pages =& $row['page_names']; + + // Find page IDs that contain commas + // This should never happen because commas are escaped by sanitize_page_id(). Nevertheless for compatibility with older + // databases, and to alleviate the concerns of hackers, we'll accommodate for page IDs with commas here by checking for + // IDs that don't match the pattern for stringified page ID + namespace. If it doesn't match, that means it's a continuation + // of the previous ID and should be concatenated to the previous entry. + $matches = strpos($pages, ',') ? explode(',', $pages) : array($pages); + $prev = false; + foreach ( $matches as $i => $_ ) + { + $match =& $matches[$i]; + if ( !preg_match("/^ns=$ns_list;pid=(.+)$/", $match) && $prev ) + { + $matches[$prev] .= ',' . $match; + unset($match, $matches[$i]); + continue; + } + $prev = $i; + } + unset($match); - // Iterate through each of the results, assigning scores based on how many times the page has shown up. - // This works because this phase of the search is strongly word-based not page-based. If a page shows up - // multiple times while fetching the result rows from the search_index table, it simply means that page - // contains more than one of the terms the user searched for. + // Iterate through each of the results, assigning scores based on how many times the page has shown up. + // This works because this phase of the search is strongly word-based not page-based. If a page shows up + // multiple times while fetching the result rows from the search_index table, it simply means that page + // contains more than one of the terms the user searched for. - foreach ( $matches as $match ) - { - $word_cs = (( $case_sensitive ) ? $row['word'] : strtolower($row['word'])); - if ( isset($word_tracking[$match]) && in_array($word_cs, $word_tracking[$match]) ) - { - continue; - } - if ( isset($word_tracking[$match]) ) - { - if ( isset($word_tracking[$match]) ) - { - $word_tracking[$match][] = $word_cs; - } - } - else - { - $word_tracking[$match] = array($word_cs); - } - - // echo '
' . print_r($word_tracking, true) . ''; - - $inc = 1; + foreach ( $matches as $match ) + { + $word_cs = (( $case_sensitive ) ? $row['word'] : strtolower($row['word'])); + if ( isset($word_tracking[$match]) && in_array($word_cs, $word_tracking[$match]) ) + { + continue; + } + if ( isset($word_tracking[$match]) ) + { + if ( isset($word_tracking[$match]) ) + { + $word_tracking[$match][] = $word_cs; + } + } + else + { + $word_tracking[$match] = array($word_cs); + } + + // echo '
' . print_r($word_tracking, true) . ''; + + $inc = 1; - // Is this search term present in the page's title? If so, give extra points - preg_match("/^ns=$ns_list;pid=(.+)$/", $match, $piecesparts); - $title = get_page_title_ns($piecesparts[2], $piecesparts[1]); - - $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr'; - if ( $test_func($title, $row['word']) || $test_func($piecesparts[2], $row['word']) ) - { - $inc = 1.5; - } - - // increase points if 2 or more words match a phrase in the title - for ( $i = 0; $i < count($where_any) - 1; $i++ ) - { - $phrase = "{$where_any[$i]} {$where_any[$i + 1]}"; - if ( $test_func($title, $phrase) ) - { - $inc *= 1.25; - } - } - - // Deduct points if there are few similarities between the words - $lev_array = array(); - foreach ( $where_any as $qword ) - { - if ( strstr($word_cs, $qword) ) - $lev_array[ $qword ] = levenshtein($qword, $word_cs); - } - if ( min($lev_array) > 3 ) - { - $inc /= array_sum($lev_array) / count($lev_array); - } - - if ( isset($scores[$match]) ) - { - $scores[$match] = $scores[$match] + $inc; - } - else - { - $scores[$match] = $inc; - } - } - } - while ( $row = $db->fetchrow($q) ); - } - $db->free_result($q); - - // - // STAGE 2: FIRST ELIMINATION ROUND - // Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it - // + // Is this search term present in the page's title? If so, give extra points + preg_match("/^ns=$ns_list;pid=(.+)$/", $match, $piecesparts); + $title = get_page_title_ns($piecesparts[2], $piecesparts[1]); + + $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr'; + if ( $test_func($title, $row['word']) || $test_func($piecesparts[2], $row['word']) ) + { + $inc = 1.5; + } + + // increase points if 2 or more words match a phrase in the title + for ( $i = 0; $i < count($where_any) - 1; $i++ ) + { + $phrase = "{$where_any[$i]} {$where_any[$i + 1]}"; + if ( $test_func($title, $phrase) ) + { + $inc *= 1.25; + } + } + + // Deduct points if there are few similarities between the words + $lev_array = array(); + foreach ( $where_any as $qword ) + { + if ( strstr($word_cs, $qword) ) + $lev_array[ $qword ] = levenshtein($qword, $word_cs); + } + if ( min($lev_array) > 3 ) + { + $inc /= array_sum($lev_array) / count($lev_array); + } + + if ( isset($scores[$match]) ) + { + $scores[$match] = $scores[$match] + $inc; + } + else + { + $scores[$match] = $inc; + } + } + } + while ( $row = $db->fetchrow($q) ); + } + $db->free_result($q); + + // + // STAGE 2: FIRST ELIMINATION ROUND + // Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it + // - foreach ( $query['req'] as $term ) - { - foreach ( $word_tracking as $i => $page ) - { - if ( !in_array($term, $page) ) - { - unset($word_tracking[$i], $scores[$i]); - } - } - } - } - - // - // STAGE 3: PHRASE SEARCHING - // Use LIKE to find pages with specified phrases. We can do a super-picky single query without another elimination round because - // at this stage we can search the full page_text column instead of relying on a word list. - // + foreach ( $query['req'] as $term ) + { + foreach ( $word_tracking as $i => $page ) + { + if ( !in_array($term, $page) ) + { + unset($word_tracking[$i], $scores[$i]); + } + } + } + } + + // + // STAGE 3: PHRASE SEARCHING + // Use LIKE to find pages with specified phrases. We can do a super-picky single query without another elimination round because + // at this stage we can search the full page_text column instead of relying on a word list. + // - // We can skip this stage if none of these special terms apply + // We can skip this stage if none of these special terms apply - $text_col = ( $case_sensitive ) ? 'page_text' : ENANO_SQLFUNC_LOWERCASE . '(page_text)'; - $name_col = ( $case_sensitive ) ? 'name' : ENANO_SQLFUNC_LOWERCASE . '(name)'; - $text_col_join = ( $case_sensitive ) ? 't.page_text' : ENANO_SQLFUNC_LOWERCASE . '(t.page_text)'; - $name_col_join = ( $case_sensitive ) ? 'p.name' : ENANO_SQLFUNC_LOWERCASE . '(p.name)'; - - $concat_column = ( ENANO_DBLAYER == 'MYSQL' ) ? - 'CONCAT(\'ns=\',t.namespace,\';pid=\',t.page_id)' : - "'ns=' || t.namespace || ';pid=' || t.page_id"; + $text_col = ( $case_sensitive ) ? 'page_text' : ENANO_SQLFUNC_LOWERCASE . '(page_text)'; + $name_col = ( $case_sensitive ) ? 'name' : ENANO_SQLFUNC_LOWERCASE . '(name)'; + $text_col_join = ( $case_sensitive ) ? 't.page_text' : ENANO_SQLFUNC_LOWERCASE . '(t.page_text)'; + $name_col_join = ( $case_sensitive ) ? 'p.name' : ENANO_SQLFUNC_LOWERCASE . '(p.name)'; + + $concat_column = ( ENANO_DBLAYER == 'MYSQL' ) ? + 'CONCAT(\'ns=\',t.namespace,\';pid=\',t.page_id)' : + "'ns=' || t.namespace || ';pid=' || t.page_id"; - if ( count($query_phrase['any']) > 0 || count($query_phrase['req']) > 0 ) - { + if ( count($query_phrase['any']) > 0 || count($query_phrase['req']) > 0 ) + { - $where_any = array(); - foreach ( $query_phrase['any'] as $term ) - { - $term = escape_string_like($term); - if ( !$case_sensitive ) - $term = strtolower($term); - $where_any[] = "( $text_col LIKE '%$term%' OR $name_col LIKE '%$term%' )"; - } + $where_any = array(); + foreach ( $query_phrase['any'] as $term ) + { + $term = escape_string_like($term); + if ( !$case_sensitive ) + $term = strtolower($term); + $where_any[] = "( $text_col LIKE '%$term%' OR $name_col LIKE '%$term%' )"; + } - $where_any = ( count($where_any) > 0 ) ? implode(" OR\n ", $where_any) : ''; + $where_any = ( count($where_any) > 0 ) ? implode(" OR\n ", $where_any) : ''; - // Also do required terms, but use AND to ensure that all required terms are included - $where_req = array(); - foreach ( $query_phrase['req'] as $term ) - { - $term = escape_string_like($term); - if ( !$case_sensitive ) - $term = strtolower($term); - $where_req[] = "( $text_col LIKE '%$term%' OR $name_col LIKE '%$term%' )"; - } - $and_clause = ( $where_any != '' ) ? 'AND ' : ''; - $where_req = ( count($where_req) > 0 ) ? "{$and_clause}" . implode(" AND\n ", $where_req) : ''; + // Also do required terms, but use AND to ensure that all required terms are included + $where_req = array(); + foreach ( $query_phrase['req'] as $term ) + { + $term = escape_string_like($term); + if ( !$case_sensitive ) + $term = strtolower($term); + $where_req[] = "( $text_col LIKE '%$term%' OR $name_col LIKE '%$term%' )"; + } + $and_clause = ( $where_any != '' ) ? 'AND ' : ''; + $where_req = ( count($where_req) > 0 ) ? "{$and_clause}" . implode(" AND\n ", $where_req) : ''; - $sql = 'SELECT ' . $concat_column . ' AS id, p.name, t.page_text FROM ' . table_prefix . "page_text AS t\n" - . " LEFT JOIN " . table_prefix . "pages AS p\n" - . " ON ( p.urlname = t.page_id AND p.namespace = t.namespace )\n" - . " WHERE p.visible = 1 AND (\n $where_any\n $where_req\n );"; - if ( !($q = $db->sql_query($sql)) ) - $db->_die('Error is in perform_search(), includes/search.php, query 2. Parsed query dump follows:
(indexable) ' . htmlspecialchars(print_r($query, true)) . '(non-indexable) ' . htmlspecialchars(print_r($query_phrase, true)) . ''); + $sql = 'SELECT ' . $concat_column . ' AS id, p.name, t.page_text FROM ' . table_prefix . "page_text AS t\n" + . " LEFT JOIN " . table_prefix . "pages AS p\n" + . " ON ( p.urlname = t.page_id AND p.namespace = t.namespace )\n" + . " WHERE p.visible = 1 AND (\n $where_any\n $where_req\n );"; + if ( !($q = $db->sql_query($sql)) ) + $db->_die('Error is in perform_search(), includes/search.php, query 2. Parsed query dump follows:
(indexable) ' . htmlspecialchars(print_r($query, true)) . '(non-indexable) ' . htmlspecialchars(print_r($query_phrase, true)) . ''); - if ( $row = $db->fetchrow() ) - { - do - { - $id =& $row['id']; - $inc = 0.0; + if ( $row = $db->fetchrow() ) + { + do + { + $id =& $row['id']; + $inc = 0.0; - $title = $row['name']; - $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr'; - - // Is this search term present in the page's title? If so, give extra points - $word_list = array_merge($query_phrase['any'], $query_phrase['req']); - foreach ( $word_list as $word ) - { - if ( $test_func($title, $word) ) - $inc += 1.5; - else if ( $test_func($row['page_text'], $word) ) - $inc += 1.0; - } - - // increase points if 2 or more words match a phrase in the title - for ( $i = 0; $i < count($word_list) - 1; $i++ ) - { - $phrase = "{$word_list[$i]} {$word_list[$i + 1]}"; - if ( $test_func($title, $phrase) ) - $inc *= 1.25; - else if ( $test_func($row['page_text'], $phrase) ) - $inc *= 1.125; - } - - if ( isset($scores[$id]) ) - { - $scores[$id] = $scores[$id] + $inc; - } - else - { - $scores[$id] = $inc; - } - } - while ( $row = $db->fetchrow() ); - } - $db->free_result(); - } + $title = $row['name']; + $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr'; + + // Is this search term present in the page's title? If so, give extra points + $word_list = array_merge($query_phrase['any'], $query_phrase['req']); + foreach ( $word_list as $word ) + { + if ( $test_func($title, $word) ) + $inc += 1.5; + else if ( $test_func($row['page_text'], $word) ) + $inc += 1.0; + } + + // increase points if 2 or more words match a phrase in the title + for ( $i = 0; $i < count($word_list) - 1; $i++ ) + { + $phrase = "{$word_list[$i]} {$word_list[$i + 1]}"; + if ( $test_func($title, $phrase) ) + $inc *= 1.25; + else if ( $test_func($row['page_text'], $phrase) ) + $inc *= 1.125; + } + + if ( isset($scores[$id]) ) + { + $scores[$id] = $scores[$id] + $inc; + } + else + { + $scores[$id] = $inc; + } + } + while ( $row = $db->fetchrow() ); + } + $db->free_result(); + } - // - // STAGE 4 - SELECT PAGE TEXT AND ELIMINATE NOTS - // At this point, we have a complete list of all the possible pages. Now we want to obtain the page text, and within the same query - // eliminate any terms that shouldn't be in there. - // + // + // STAGE 4 - SELECT PAGE TEXT AND ELIMINATE NOTS + // At this point, we have a complete list of all the possible pages. Now we want to obtain the page text, and within the same query + // eliminate any terms that shouldn't be in there. + // - // Generate master word list for the highlighter - $word_list = array_values(array_merge($query['any'], $query['req'], $query_phrase['any'], $query_phrase['req'])); + // Generate master word list for the highlighter + $word_list = array_values(array_merge($query['any'], $query['req'], $query_phrase['any'], $query_phrase['req'])); - $text_where = array(); - foreach ( $scores as $page_id => $_ ) - { - $text_where[] = $db->escape($page_id); - } - $text_where = '( ' . $concat_column . ' = \'' . implode('\' OR ' . $concat_column . ' = \'', $text_where) . '\' )'; + $text_where = array(); + foreach ( $scores as $page_id => $_ ) + { + $text_where[] = $db->escape($page_id); + } + $text_where = '( ' . $concat_column . ' = \'' . implode('\' OR ' . $concat_column . ' = \'', $text_where) . '\' )'; - if ( count($query['not']) > 0 ) - $text_where .= ' AND'; + if ( count($query['not']) > 0 ) + $text_where .= ' AND'; - $where_not = array(); - foreach ( $query['not'] as $term ) - { - $term = escape_string_like($term); - if ( !$case_sensitive ) - $term = strtolower($term); - $where_not[] = $term; - } - $where_not = ( count($where_not) > 0 ) ? "$text_col NOT LIKE '%" . implode("%' AND $text_col NOT LIKE '%", $where_not) . "%'" : ''; + $where_not = array(); + foreach ( $query['not'] as $term ) + { + $term = escape_string_like($term); + if ( !$case_sensitive ) + $term = strtolower($term); + $where_not[] = $term; + } + $where_not = ( count($where_not) > 0 ) ? "$text_col NOT LIKE '%" . implode("%' AND $text_col NOT LIKE '%", $where_not) . "%'" : ''; - $sql = 'SELECT ' . $concat_column . ' AS id, t.page_id, t.namespace, CHAR_LENGTH(t.page_text) AS page_length, t.page_text, p.name AS page_name FROM ' . table_prefix . "page_text AS t - LEFT JOIN " . table_prefix . "pages AS p - ON ( p.urlname = t.page_id AND p.namespace = t.namespace ) - WHERE p.visible = 1 AND ( $text_where $where_not );"; - if ( !($q = $db->sql_unbuffered_query($sql)) ) - $db->_die('Error is in perform_search(), includes/search.php, query 3'); + $sql = 'SELECT ' . $concat_column . ' AS id, t.page_id, t.namespace, CHAR_LENGTH(t.page_text) AS page_length, t.page_text, p.name AS page_name FROM ' . table_prefix . "page_text AS t + LEFT JOIN " . table_prefix . "pages AS p + ON ( p.urlname = t.page_id AND p.namespace = t.namespace ) + WHERE p.visible = 1 AND ( $text_where $where_not );"; + if ( !($q = $db->sql_unbuffered_query($sql)) ) + $db->_die('Error is in perform_search(), includes/search.php, query 3'); - $page_data = array(); - if ( $row = $db->fetchrow() ) - { - do - { - $row['page_text'] = htmlspecialchars($row['page_text']); - $row['page_name'] = htmlspecialchars($row['page_name']); + $page_data = array(); + if ( $row = $db->fetchrow() ) + { + do + { + $row['page_text'] = htmlspecialchars($row['page_text']); + $row['page_name'] = htmlspecialchars($row['page_name']); - // Highlight results (this is wonderfully automated) - $row['page_text'] = highlight_and_clip_search_result($row['page_text'], $word_list, $case_sensitive); - if ( strlen($row['page_text']) > 250 && !preg_match('/^\.\.\.(.+)\.\.\.$/', $row['page_text']) ) - { - $row['page_text'] = substr($row['page_text'], 0, 150) . '...'; - } - $row['page_name'] = highlight_search_result($row['page_name'], $word_list, $case_sensitive); + // Highlight results (this is wonderfully automated) + $row['page_text'] = highlight_and_clip_search_result($row['page_text'], $word_list, $case_sensitive); + if ( strlen($row['page_text']) > 250 && !preg_match('/^\.\.\.(.+)\.\.\.$/', $row['page_text']) ) + { + $row['page_text'] = substr($row['page_text'], 0, 150) . '...'; + } + $row['page_name'] = highlight_search_result($row['page_name'], $word_list, $case_sensitive); - $page_data[$row['id']] = $row; - } - while ( $row = $db->fetchrow() ); - } - $db->free_result(); - - // - // STAGE 5 - SPECIAL PAGE TITLE SEARCH - // Iterate through $paths->pages and check the titles for search terms. Score accordingly. - // + $page_data[$row['id']] = $row; + } + while ( $row = $db->fetchrow() ); + } + $db->free_result(); + + // + // STAGE 5 - SPECIAL PAGE TITLE SEARCH + // Iterate through $paths->pages and check the titles for search terms. Score accordingly. + // - foreach ( $paths->pages as $id => $page ) - { - if ( $page['namespace'] != 'Special' || $page['visible'] == 0 ) - continue; - $idstring = 'ns=' . $page['namespace'] . ';pid=' . $page['urlname_nons']; - $any = array_values(array_unique(array_merge($query['any'], $query_phrase['any']))); - foreach ( $any as $term ) - { - if ( $case_sensitive ) - { - if ( strstr($page['name'], $term) || strstr($page['urlname_nons'], $term) ) - { - ( isset($scores[$idstring]) ) ? $scores[$idstring] = $scores[$idstring] + 1.5 : $scores[$idstring] = 1.5; - } - } - else - { - if ( stristr($page['name'], $term) || stristr($page['urlname_nons'], $term) ) - { - ( isset($scores[$idstring]) ) ? $scores[$idstring] = $scores[$idstring] + 1.5 : $scores[$idstring] = 1.5; - } - } - } - if ( isset($scores[$idstring]) ) - { - $page_data[$idstring] = array( - 'page_name' => highlight_search_result($page['name'], $word_list, $case_sensitive), - 'page_text' => '', - 'page_id' => $page['urlname_nons'], - 'namespace' => $page['namespace'], - 'score' => $scores[$idstring], - 'page_length' => 1, - 'page_note' => '[' . $lang->get('search_result_tag_special') . ']' - ); - } - } - - // - // STAGE 6 - SECOND ELIMINATION ROUND - // Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it - // + foreach ( $paths->pages as $id => $page ) + { + if ( $page['namespace'] != 'Special' || $page['visible'] == 0 ) + continue; + $idstring = 'ns=' . $page['namespace'] . ';pid=' . $page['urlname_nons']; + $any = array_values(array_unique(array_merge($query['any'], $query_phrase['any']))); + foreach ( $any as $term ) + { + if ( $case_sensitive ) + { + if ( strstr($page['name'], $term) || strstr($page['urlname_nons'], $term) ) + { + ( isset($scores[$idstring]) ) ? $scores[$idstring] = $scores[$idstring] + 1.5 : $scores[$idstring] = 1.5; + } + } + else + { + if ( stristr($page['name'], $term) || stristr($page['urlname_nons'], $term) ) + { + ( isset($scores[$idstring]) ) ? $scores[$idstring] = $scores[$idstring] + 1.5 : $scores[$idstring] = 1.5; + } + } + } + if ( isset($scores[$idstring]) ) + { + $page_data[$idstring] = array( + 'page_name' => highlight_search_result($page['name'], $word_list, $case_sensitive), + 'page_text' => '', + 'page_id' => $page['urlname_nons'], + 'namespace' => $page['namespace'], + 'score' => $scores[$idstring], + 'page_length' => 1, + 'page_note' => '[' . $lang->get('search_result_tag_special') . ']' + ); + } + } + + // + // STAGE 6 - SECOND ELIMINATION ROUND + // Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it + // - $required = array_merge($query['req'], $query_phrase['req']); - foreach ( $required as $term ) - { - foreach ( $page_data as $id => $page ) - { - if ( ( $page['namespace'] == 'Special' || ( $page['namespace'] != 'Special' && !strstr($page['page_text'], $term) ) ) && !strstr($page['page_id'], $term) && !strstr($page['page_name'], $term) ) - { - unset($page_data[$id]); - } - } - } + $required = array_merge($query['req'], $query_phrase['req']); + foreach ( $required as $term ) + { + foreach ( $page_data as $id => $page ) + { + if ( ( $page['namespace'] == 'Special' || ( $page['namespace'] != 'Special' && !strstr($page['page_text'], $term) ) ) && !strstr($page['page_id'], $term) && !strstr($page['page_name'], $term) ) + { + unset($page_data[$id]); + } + } + } - // At this point, all of our normal results are in. However, we can also allow plugins to hook into the system and score their own - // pages and add text, etc. as necessary. - // Plugins are COMPLETELY responsible for using the search terms and handling Boolean logic properly + // At this point, all of our normal results are in. However, we can also allow plugins to hook into the system and score their own + // pages and add text, etc. as necessary. + // Plugins are COMPLETELY responsible for using the search terms and handling Boolean logic properly - inject_custom_search_results($query, $query_phrase, $scores, $page_data, $case_sensitive, $word_list); - - $code = $plugins->setHook('search_global_inner'); - foreach ( $code as $cmd ) - { - eval($cmd); - } + inject_custom_search_results($query, $query_phrase, $scores, $page_data, $case_sensitive, $word_list); + + $code = $plugins->setHook('search_global_inner'); + foreach ( $code as $cmd ) + { + eval($cmd); + } - // a marvelous debugging aid :-) - // die('
' . htmlspecialchars(print_r($page_data, true)) . ''); + // a marvelous debugging aid :-) + // die('
' . htmlspecialchars(print_r($page_data, true)) . ''); - // - // STAGE 7 - HIGHLIGHT, TRIM, AND SCORE RESULTS - // We now have the complete results of the search. We need to trim text down to show only portions of the page containing search - // terms, highlight any search terms within the page, and sort the final results array in descending order of score. - // + // + // STAGE 7 - HIGHLIGHT, TRIM, AND SCORE RESULTS + // We now have the complete results of the search. We need to trim text down to show only portions of the page containing search + // terms, highlight any search terms within the page, and sort the final results array in descending order of score. + // - // Sort scores array - arsort($scores); + // Sort scores array + arsort($scores); - // Divisor for calculating relevance scores - $divisor = ( count($query['any']) + count($query_phrase['any']) + count($query['req']) + count($query['not']) ) * 1.5; - $divisor = max($divisor, max($scores)); - - foreach ( $scores as $page_id => $score ) - { - if ( !isset($page_data[$page_id]) ) - // It's possible that $scores contains a score for a page that was later eliminated because it contained a disallowed term - continue; + // Divisor for calculating relevance scores + $divisor = ( count($query['any']) + count($query_phrase['any']) + count($query['req']) + count($query['not']) ) * 1.5; + $divisor = max($divisor, max($scores)); + + foreach ( $scores as $page_id => $score ) + { + if ( !isset($page_data[$page_id]) ) + // It's possible that $scores contains a score for a page that was later eliminated because it contained a disallowed term + continue; - // Make a copy of the datum, then delete the original (it frees up a LOT of RAM) - $datum = $page_data[$page_id]; - unset($page_data[$page_id]); + // Make a copy of the datum, then delete the original (it frees up a LOT of RAM) + $datum = $page_data[$page_id]; + unset($page_data[$page_id]); - // This is an internal value used for sorting - it's no longer needed. - unset($datum['id']); + // This is an internal value used for sorting - it's no longer needed. + unset($datum['id']); - // Calculate score - // if ( $score > $divisor ) - // $score = $divisor; - $datum['score'] = round($score / $divisor, 2) * 100; - - // Highlight the URL - $datum['url_highlight'] = makeUrlComplete($datum['namespace'], $datum['page_id']); - $datum['url_highlight'] = preg_replace('/\?.+$/', '', $datum['url_highlight']); - $datum['url_highlight'] = highlight_search_result($datum['url_highlight'], $word_list, $case_sensitive); + // Calculate score + // if ( $score > $divisor ) + // $score = $divisor; + $datum['score'] = round($score / $divisor, 2) * 100; + + // Highlight the URL + $datum['url_highlight'] = makeUrlComplete($datum['namespace'], $datum['page_id']); + $datum['url_highlight'] = preg_replace('/\?.+$/', '', $datum['url_highlight']); + $datum['url_highlight'] = highlight_search_result($datum['url_highlight'], $word_list, $case_sensitive); - // Store it in our until-now-unused results array - $results[] = $datum; - } + // Store it in our until-now-unused results array + $results[] = $datum; + } - // Our work here is done. :-D - return $results; + // Our work here is done. :-D + return $results; } /** @@ -594,166 +594,166 @@ function parse_search_query($query, &$warnings) { - global $lang; - - $stopwords = get_stopwords(); - $ret = array( - 'any' => array(), - 'req' => array(), - 'not' => array() - ); - $warnings = array(); - $terms = array(); - $in_quote = false; - $start_term = 0; - $just_finished = false; - for ( $i = 0; $i < strlen($query); $i++ ) - { - $chr = $query{$i}; - $prev = ( $i > 0 ) ? $query{ $i - 1 } : ''; - $next = ( ( $i + 1 ) < strlen($query) ) ? $query{ $i + 1 } : ''; + global $lang; + + $stopwords = get_stopwords(); + $ret = array( + 'any' => array(), + 'req' => array(), + 'not' => array() + ); + $warnings = array(); + $terms = array(); + $in_quote = false; + $start_term = 0; + $just_finished = false; + for ( $i = 0; $i < strlen($query); $i++ ) + { + $chr = $query{$i}; + $prev = ( $i > 0 ) ? $query{ $i - 1 } : ''; + $next = ( ( $i + 1 ) < strlen($query) ) ? $query{ $i + 1 } : ''; - if ( ( $chr == ' ' && !$in_quote ) || ( $i + 1 == strlen ( $query ) ) ) - { - $len = ( $next == '' ) ? $i + 1 : $i - $start_term; - $word = substr ( $query, $start_term, $len ); - $terms[] = $word; - $start_term = $i + 1; - } + if ( ( $chr == ' ' && !$in_quote ) || ( $i + 1 == strlen ( $query ) ) ) + { + $len = ( $next == '' ) ? $i + 1 : $i - $start_term; + $word = substr ( $query, $start_term, $len ); + $terms[] = $word; + $start_term = $i + 1; + } - elseif ( $chr == '"' && $in_quote && $prev != '\\' ) - { - $word = substr ( $query, $start_term, $i - $start_term + 1 ); - $start_pos = ( $next == ' ' ) ? $i + 2 : $i + 1; - $in_quote = false; - } + elseif ( $chr == '"' && $in_quote && $prev != '\\' ) + { + $word = substr ( $query, $start_term, $i - $start_term + 1 ); + $start_pos = ( $next == ' ' ) ? $i + 2 : $i + 1; + $in_quote = false; + } - elseif ( $chr == '"' && !$in_quote ) - { - $in_quote = true; - $start_pos = $i; - } + elseif ( $chr == '"' && !$in_quote ) + { + $in_quote = true; + $start_pos = $i; + } - } + } - $ticker = 0; + $ticker = 0; - foreach ( $terms as $element => $__unused ) - { - $atom =& $terms[$element]; + foreach ( $terms as $element => $__unused ) + { + $atom =& $terms[$element]; - $ticker++; + $ticker++; - if ( $ticker == 20 ) - { - $warnings[] = $lang->get('search_err_query_too_many_terms'); - break; - } + if ( $ticker == 20 ) + { + $warnings[] = $lang->get('search_err_query_too_many_terms'); + break; + } - if ( substr ( $atom, 0, 2 ) == '+"' && substr ( $atom, ( strlen ( $atom ) - 1 ), 1 ) == '"' ) - { - $word = substr ( $atom, 2, ( strlen( $atom ) - 3 ) ); - if ( strlen ( $word ) < 2 || in_array($word, $stopwords) ) - { - $warnings[] = $lang->get('search_err_query_has_stopwords'); - $ticker--; - continue; - } - if(in_array($word, $ret['req'])) - { - $warnings[] = $lang->get('search_err_query_dup_terms'); - $ticker--; - continue; - } - $ret['req'][] = $word; - } - elseif ( substr ( $atom, 0, 2 ) == '-"' && substr ( $atom, ( strlen ( $atom ) - 1 ), 1 ) == '"' ) - { - $word = substr ( $atom, 2, ( strlen( $atom ) - 3 ) ); - if ( strlen ( $word ) < 4 ) - { - $warnings[] = $lang->get('search_err_query_term_too_short'); - $ticker--; - continue; - } - if(in_array($word, $ret['not'])) - { - $warnings[] = $lang->get('search_err_query_dup_terms'); - $ticker--; - continue; - } - $ret['not'][] = $word; - } - elseif ( substr ( $atom, 0, 1 ) == '+' ) - { - $word = substr ( $atom, 1 ); - if ( strlen ( $word ) < 2 || in_array($word, $stopwords) ) - { - $warnings[] = $lang->get('search_err_query_has_stopwords'); - $ticker--; - continue; - } - if(in_array($word, $ret['req'])) - { - $warnings[] = $lang->get('search_err_query_dup_terms'); - $ticker--; - continue; - } - $ret['req'][] = $word; - } - elseif ( substr ( $atom, 0, 1 ) == '-' ) - { - $word = substr ( $atom, 1 ); - if ( strlen ( $word ) < 2 || in_array($word, $stopwords) ) - { - $warnings[] = $lang->get('search_err_query_has_stopwords'); - $ticker--; - continue; - } - if(in_array($word, $ret['not'])) - { - $warnings[] = $lang->get('search_err_query_dup_terms'); - $ticker--; - continue; - } - $ret['not'][] = $word; - } - elseif ( substr ( $atom, 0, 1 ) == '"' && substr ( $atom, ( strlen($atom) - 1 ), 1 ) == '"' ) - { - $word = substr ( $atom, 1, ( strlen ( $atom ) - 2 ) ); - if ( strlen ( $word ) < 2 || in_array($word, $stopwords) ) - { - $warnings[] = $lang->get('search_err_query_has_stopwords'); - $ticker--; - continue; - } - if(in_array($word, $ret['any'])) - { - $warnings[] = $lang->get('search_err_query_dup_terms'); - $ticker--; - continue; - } - $ret['any'][] = $word; - } - else - { - $word = $atom; - if ( strlen ( $word ) < 2 || in_array($word, $stopwords) ) - { - $warnings[] = $lang->get('search_err_query_has_stopwords'); - $ticker--; - continue; - } - if(in_array($word, $ret['any'])) - { - $warnings[] = $lang->get('search_err_query_dup_terms'); - $ticker--; - continue; - } - $ret['any'][] = $word; - } - } - return $ret; + if ( substr ( $atom, 0, 2 ) == '+"' && substr ( $atom, ( strlen ( $atom ) - 1 ), 1 ) == '"' ) + { + $word = substr ( $atom, 2, ( strlen( $atom ) - 3 ) ); + if ( strlen ( $word ) < 2 || in_array($word, $stopwords) ) + { + $warnings[] = $lang->get('search_err_query_has_stopwords'); + $ticker--; + continue; + } + if(in_array($word, $ret['req'])) + { + $warnings[] = $lang->get('search_err_query_dup_terms'); + $ticker--; + continue; + } + $ret['req'][] = $word; + } + elseif ( substr ( $atom, 0, 2 ) == '-"' && substr ( $atom, ( strlen ( $atom ) - 1 ), 1 ) == '"' ) + { + $word = substr ( $atom, 2, ( strlen( $atom ) - 3 ) ); + if ( strlen ( $word ) < 4 ) + { + $warnings[] = $lang->get('search_err_query_term_too_short'); + $ticker--; + continue; + } + if(in_array($word, $ret['not'])) + { + $warnings[] = $lang->get('search_err_query_dup_terms'); + $ticker--; + continue; + } + $ret['not'][] = $word; + } + elseif ( substr ( $atom, 0, 1 ) == '+' ) + { + $word = substr ( $atom, 1 ); + if ( strlen ( $word ) < 2 || in_array($word, $stopwords) ) + { + $warnings[] = $lang->get('search_err_query_has_stopwords'); + $ticker--; + continue; + } + if(in_array($word, $ret['req'])) + { + $warnings[] = $lang->get('search_err_query_dup_terms'); + $ticker--; + continue; + } + $ret['req'][] = $word; + } + elseif ( substr ( $atom, 0, 1 ) == '-' ) + { + $word = substr ( $atom, 1 ); + if ( strlen ( $word ) < 2 || in_array($word, $stopwords) ) + { + $warnings[] = $lang->get('search_err_query_has_stopwords'); + $ticker--; + continue; + } + if(in_array($word, $ret['not'])) + { + $warnings[] = $lang->get('search_err_query_dup_terms'); + $ticker--; + continue; + } + $ret['not'][] = $word; + } + elseif ( substr ( $atom, 0, 1 ) == '"' && substr ( $atom, ( strlen($atom) - 1 ), 1 ) == '"' ) + { + $word = substr ( $atom, 1, ( strlen ( $atom ) - 2 ) ); + if ( strlen ( $word ) < 2 || in_array($word, $stopwords) ) + { + $warnings[] = $lang->get('search_err_query_has_stopwords'); + $ticker--; + continue; + } + if(in_array($word, $ret['any'])) + { + $warnings[] = $lang->get('search_err_query_dup_terms'); + $ticker--; + continue; + } + $ret['any'][] = $word; + } + else + { + $word = $atom; + if ( strlen ( $word ) < 2 || in_array($word, $stopwords) ) + { + $warnings[] = $lang->get('search_err_query_has_stopwords'); + $ticker--; + continue; + } + if(in_array($word, $ret['any'])) + { + $warnings[] = $lang->get('search_err_query_dup_terms'); + $ticker--; + continue; + } + $ret['any'][] = $word; + } + } + return $ret; } /** @@ -764,10 +764,10 @@ function escape_string_like($string) { - global $db, $session, $paths, $template, $plugins; // Common objects - $string = $db->escape($string); - $string = str_replace(array('%', '_'), array('\%', '\_'), $string); - return $string; + global $db, $session, $paths, $template, $plugins; // Common objects + $string = $db->escape($string); + $string = str_replace(array('%', '_'), array('\%', '\_'), $string); + return $string; } /** @@ -780,18 +780,18 @@ function highlight_search_result($pt, $words, $case_sensitive = false) { - $words2 = array(); - for ( $i = 0; $i < sizeof($words); $i++) - { - if(!empty($words[$i])) - $words2[] = preg_quote($words[$i]); - } + $words2 = array(); + for ( $i = 0; $i < sizeof($words); $i++) + { + if(!empty($words[$i])) + $words2[] = preg_quote($words[$i]); + } - $flag = ( $case_sensitive ) ? '' : 'i'; - $regex = '/(' . implode('|', str_replace('/', '\\/', $words2)) . ')/' . $flag; - $pt = preg_replace($regex, '