581 AND p.visible=1;'; // Only indexes "visible" pages |
581 AND p.visible=1;'; // Only indexes "visible" pages |
582 return $texts; |
582 return $texts; |
583 } |
583 } |
584 |
584 |
/**
 * Builds a word list for search indexing.
 *
 * The page text, the page ID (after dirtify_page_id()) and the page title are
 * combined, every character other than a-z, 0-9 and the apostrophe is treated
 * as a word separator, runs of two or more apostrophes are stripped from each
 * word, and words shorter than two characters are discarded.
 *
 * @param string Text to index
 * @param string Page ID of the page being indexed
 * @param string Title of the page being indexed
 * @return array List of unique words, indexed sequentially from 0
 */

function calculate_word_list($text, $page_id, $page_name)
{
  $page_id = dirtify_page_id($page_id);
  
  // Replacing separators after joining is equivalent to cleaning each piece
  // on its own, because the glue between the pieces is itself a space.
  $combined = preg_replace('/[^a-z0-9\']/i', ' ', "$text $page_id $page_name");
  
  // Collect into a fresh array rather than iterating by reference and
  // unsetting entries of the array being walked; that pattern leaves $word
  // bound to the last element once the loop is done.
  $words = array();
  foreach ( explode(' ', $combined) as $word )
  {
    // Runs of apostrophes are dropped entirely (presumably wiki bold/italic
    // markup -- TODO confirm); a single apostrophe inside a word is kept.
    if ( strpos($word, "''") !== false )
      $word = preg_replace("/'{2,}/", '', $word);
    
    // Also discards the empty strings explode() yields between adjacent spaces
    if ( strlen($word) < 2 )
      continue;
    
    $words[] = $word;
  }
  
  // De-duplicate first and reindex second so the result has no gaps in its keys
  return array_values(array_unique($words));
}
|
611 |
|
/**
 * Rebuilds the site's entire search index. Considerably more exciting if run from the command line.
 *
 * Empties the search_index table, then reads every page that is visible and
 * has no password (blank, or the SHA-1 of a blank string) in batches of
 * $pages_in_batch. For each page the word list is calculated; words not seen
 * on an earlier page are inserted as new rows, and words already indexed get
 * the page's identifier appended to their page_names column.
 *
 * @param bool If true, verbose output.
 * @param bool If true, verbose + debugging output.
 * @return bool True once indexing has run to completion; failed queries are handed to $db->_die().
 */

function rebuild_search_index($verbose = false, $debug = false)
{
  global $db, $session, $paths, $template, $plugins; // Common objects
  
  // Indexing a large site can take a while; suppress the warning on hosts
  // where set_time_limit() is disabled.
  @set_time_limit(0);
  
  // Use the prefixed table name, matching the INSERT and UPDATE queries below
  $q = $db->sql_query('DELETE FROM ' . table_prefix . 'search_index;');
  if ( !$q )
    $db->_die();
  
  $sha1_blank = sha1('');
  $is_web = isset($_SERVER['REQUEST_URI']);
  
  //
  // Index $pages_in_batch pages at a time
  //
  $pages_in_batch = 15;
  
  // First find out how many pages there are
  $q = $db->sql_query('SELECT COUNT(p.urlname) AS num_pages FROM ' . table_prefix . "page_text AS t\n"
                    . " LEFT JOIN " . table_prefix . "pages AS p\n"
                    . " ON ( p.urlname = t.page_id AND p.namespace = t.namespace )\n"
                    . " WHERE ( p.password = '' OR p.password = '$sha1_blank' )\n"
                    . " AND ( p.visible = 1 );");
  if ( !$q )
    $db->_die();
  
  list($num_pages) = $db->fetchrow_num();
  $num_pages = intval($num_pages);
  $loops = (int) ceil($num_pages / $pages_in_batch);
  
  // Set of words that already have a row in search_index (word => true).
  // A keyed lookup is exact and constant-time; a loose in_array() would treat
  // numeric-looking words such as "007" and "7" as the same word.
  $indexed_words = array();
  $stopwords = get_stopwords();
  
  for ( $j = 0; $j < $loops; $j++ )
  {
    $offset = $j * $pages_in_batch;
    $round = $j + 1;
    
    if ( $verbose && $debug )
    {
      echo "Running indexing round $round of $loops (offset $offset)\n" . ( $is_web ? '<br />' : '' );
    }
    
    // ORDER BY keeps the batches stable: without it the row order under
    // LIMIT/OFFSET is unspecified and pages could be skipped or repeated.
    // "LIMIT n OFFSET m" is accepted by both MySQL and PostgreSQL, whereas
    // "LIMIT m, n" is MySQL-only.
    $texts = $db->sql_query('SELECT p.name, t.page_id, t.namespace, t.page_text FROM ' . table_prefix . "page_text AS t\n"
                          . " LEFT JOIN " . table_prefix . "pages AS p\n"
                          . " ON ( p.urlname = t.page_id AND p.namespace = t.namespace )\n"
                          . " WHERE ( p.password = '' OR p.password = '$sha1_blank' )\n"
                          . " AND ( p.visible = 1 )\n"
                          . " ORDER BY t.namespace, t.page_id\n"
                          . " LIMIT $pages_in_batch OFFSET $offset;", false);
    if ( !$texts )
      $db->_die();
    
    $k = $offset;
    
    while ( $row = $db->fetchrow($texts) )
    {
      $k++;
      if ( $verbose )
      {
        echo " Indexing page $k of $num_pages: {$row['namespace']}:{$row['page_id']}";
        if ( $debug )
        {
          $mu = memory_get_usage();
          echo ", mem = $mu...";
        }
        flush();
      }
      
      // Indexing identifier for the page in the DB
      $page_uniqid = "ns={$row['namespace']};pid=" . sanitize_page_id($row['page_id']);
      $page_uniqid = $db->escape($page_uniqid);
      
      // List of words on the page
      $wordlist = $this->calculate_word_list($row['page_text'], $row['page_id'], $row['name']);
      
      // Index calculation complete -- run inserts
      $inserts = array();
      foreach ( $wordlist as $word )
      {
        // Skip short words, pure integers and stopwords
        if ( strlen($word) < 3 || strval(intval($word)) === $word || in_array($word, $stopwords) )
          continue;
        
        $word_db = $db->escape($word);
        if ( !isset($indexed_words[$word]) )
        {
          // First page containing this word: queue a new row
          $inserts[] = "( '$word_db', '$page_uniqid' )";
          $indexed_words[$word] = true;
        }
        else
        {
          // Word already has a row: append this page to its page list
          if ( $verbose && $debug )
            echo '.';
          $pid_col = ( ENANO_DBLAYER == 'MYSQL' ) ?
                      "CONCAT( page_names, ',$page_uniqid' )":
                      "page_names || ',$page_uniqid'";
          $q = $db->sql_query('UPDATE ' . table_prefix . "search_index SET page_names = $pid_col WHERE word = '$word_db';", false);
          if ( !$q )
            $db->_die();
        }
      }
      
      // All new words for this page go in with a single multi-row INSERT
      if ( count($inserts) > 0 )
      {
        if ( $verbose && $debug )
          echo 'i';
        $inserts = implode(",\n ", $inserts);
        $q = $db->sql_query('INSERT INTO ' . table_prefix . "search_index(word, page_names) VALUES\n $inserts;", false);
        if ( !$q )
          $db->_die();
      }
      
      if ( $verbose )
      {
        if ( $is_web )
          echo '<br />';
        echo "\n";
      }
      
      // Release per-page data before fetching the next row
      unset($inserts, $wordlist, $page_uniqid, $word_db, $q, $word);
    }
    $db->free_result($texts);
  }
  
  if ( $verbose )
  {
    echo "Indexing complete.";
    if ( $is_web )
      echo '<br />';
    echo "\n";
  }
  return true;
}
662 |
749 |
663 /** |
750 /** |
664 * Partially rebuilds the search index, removing/inserting entries only for the current page |
751 * Partially rebuilds the search index, removing/inserting entries only for the current page |
665 * @param string $page_id |
752 * @param string $page_id |