721 ON ( t.page_id=p.urlname AND t.namespace=p.namespace ) |
721 ON ( t.page_id=p.urlname AND t.namespace=p.namespace ) |
722 WHERE p.namespace=t.namespace |
722 WHERE p.namespace=t.namespace |
723 AND ( p.password=\'\' OR p.password=\'da39a3ee5e6b4b0d3255bfef95601890afd80709\' ) |
723 AND ( p.password=\'\' OR p.password=\'da39a3ee5e6b4b0d3255bfef95601890afd80709\' ) |
724 AND p.visible=1;'; // Only indexes "visible" pages |
724 AND p.visible=1;'; // Only indexes "visible" pages |
725 return $texts; |
725 return $texts; |
|
726 } |
|
727 |
|
/**
 * Get the unique words on a page. Returns every item of the small array $arr1 that does not occur in the very large
 * array $arr2. Items are matched strictly (same type and value), the order of $arr1 is preserved, and the result is
 * re-indexed from zero. An item that occurs several times in $arr1 is dropped at every position if it is in $arr2,
 * and kept at every position if it is not.
 * @param array Words found on the page being examined
 * @param array Words to exclude (e.g. the list of words that are already known)
 * @return array
 */

function get_unique_words($arr1, $arr2)
{
  // Build a hash lookup of $arr2 once, so that each item of $arr1 costs a single isset() instead of a scan.
  // String keys get a non-numeric prefix: PHP would otherwise cast a key like "123" to the integer 123 and the
  // strict string/integer distinction would be lost.
  $known = array();
  $others = array();
  foreach ( $arr2 as $w )
  {
    if ( is_string($w) )
    {
      $known['s' . $w] = true;
    }
    else
    {
      // Non-string values cannot be used safely as array keys; keep them for a strict linear comparison.
      $others[] = $w;
    }
  }
  
  $ret = array();
  foreach ( $arr1 as $w )
  {
    if ( is_string($w) )
    {
      $found = isset($known['s' . $w]);
    }
    else
    {
      $found = in_array($w, $others, true);
    }
    if ( !$found )
    {
      $ret[] = $w;
    }
  }
  return $ret;
}
727 |
753 |
728 /** |
754 /** |
729 * Builds a word list for search indexing. |
755 * Builds a word list for search indexing. |
730 * @param string Text to index |
756 * @param string Text to index |
861 // Indexing identifier for the page in the DB |
887 // Indexing identifier for the page in the DB |
862 $page_uniqid = "ns={$row['namespace']};pid=" . sanitize_page_id($row['page_id']); |
888 $page_uniqid = "ns={$row['namespace']};pid=" . sanitize_page_id($row['page_id']); |
863 $page_uniqid = $db->escape($page_uniqid); |
889 $page_uniqid = $db->escape($page_uniqid); |
864 |
890 |
865 // List of words on the page |
891 // List of words on the page |
|
892 if ( $debug ) |
|
893 echo "wordlist..."; |
866 $wordlist = $this->calculate_word_list($row['page_text'], $row['page_id'], $row['name']); |
894 $wordlist = $this->calculate_word_list($row['page_text'], $row['page_id'], $row['name']); |
867 |
895 |
868 // Index calculation complete -- run inserts |
896 // Index calculation complete -- run inserts |
869 $inserts = array(); |
897 $inserts = array(); |
|
898 $qt = array(); |
|
899 $unique_words = $this->get_unique_words($wordlist, $master_word_list); |
870 foreach ( $wordlist as $word ) |
900 foreach ( $wordlist as $word ) |
871 { |
901 { |
|
902 $qs = microtime_float(); |
872 if ( in_array($word, $stopwords) || strval(intval($word)) === $word || strlen($word) < 3 ) |
903 if ( in_array($word, $stopwords) || strval(intval($word)) === $word || strlen($word) < 3 ) |
873 continue; |
904 continue; |
874 $word_db = $db->escape($word); |
905 $word_db = $db->escape($word); |
875 $word_db_lc = $db->escape(strtolower($word)); |
906 $word_db_lc = $db->escape(strtolower($word)); |
876 if ( !in_array($word, $master_word_list) ) |
907 if ( in_array($word, $unique_words) ) |
877 { |
908 { |
878 $inserts[] = "( '$word_db', '$word_db_lc', '$page_uniqid' )"; |
909 $inserts[] = "( '$word_db', '$word_db_lc', '$page_uniqid' )"; |
879 } |
910 } |
880 else |
911 else |
881 { |
912 { |
886 "page_names || ',$page_uniqid'"; |
917 "page_names || ',$page_uniqid'"; |
887 $q = $db->sql_query('UPDATE ' . table_prefix . "search_index SET page_names = $pid_col WHERE word = '$word_db';", false); |
918 $q = $db->sql_query('UPDATE ' . table_prefix . "search_index SET page_names = $pid_col WHERE word = '$word_db';", false); |
888 if ( !$q ) |
919 if ( !$q ) |
889 $db->_die(); |
920 $db->_die(); |
890 } |
921 } |
|
922 $qt[] = microtime_float() - $qs; |
891 } |
923 } |
|
924 if ( $debug && count($qt) > 0 ) |
|
925 echo "QT: " . number_format(array_sum($qt) / count($qt), 4) . " * " . count($qt) . '; wl_len: ' . count($master_word_list) .' '; |
892 if ( count($inserts) > 0 ) |
926 if ( count($inserts) > 0 ) |
893 { |
927 { |
894 if ( $verbose && $debug ) |
928 if ( $verbose && $debug ) |
895 echo 'i'; |
929 echo 'i'; |
896 $inserts = implode(",\n ", $inserts); |
930 $inserts = implode(",\n ", $inserts); |
897 $q = $db->sql_query('INSERT INTO ' . table_prefix . "search_index(word, word_lcase, page_names) VALUES\n $inserts;", false); |
931 $q = $db->sql_query('INSERT INTO ' . table_prefix . "search_index(word, word_lcase, page_names) VALUES\n $inserts;", false); |
898 if ( !$q ) |
932 if ( !$q ) |
899 $db->_die(); |
933 $db->_die(); |
900 } |
934 } |
901 |
935 |
902 $master_word_list = array_unique(array_merge($master_word_list, $wordlist)); |
936 $master_word_list = array_merge($master_word_list, $unique_words); |
903 if ( $verbose ) |
937 if ( $verbose ) |
904 { |
938 { |
905 if ( isset($_SERVER['REQUEST_URI']) ) |
939 if ( isset($_SERVER['REQUEST_URI']) ) |
906 echo '<br />'; |
940 echo '<br />'; |
907 echo "\n"; |
941 echo "\n"; |
908 } |
942 } |
909 unset($inserts, $wordlist, $page_uniqid, $word_db, $q, $word, $row); |
943 unset($inserts, $wordlist, $page_uniqid, $word_db, $q, $word, $row, $unique_words); |
910 } |
944 } |
911 while ( $row = $db->fetchrow($texts) ); |
945 while ( $row = $db->fetchrow($texts) ); |
912 } |
946 } |
913 $db->free_result($texts); |
947 $db->free_result($texts); |
914 } |
948 } |