# HG changeset patch
# User Dan
# Date 1196031183 18000
# Node ID b3cfaf0a505c4ac1ea7c1155daec386f6b7f8955
# Parent a1d0846c4504ee917e6c3d350d875212d2d2f910
Fixed highlighting in search results; changed search algorithm to give more score for terms found in page title; hopefully (hackishly) fixed login_key_cache getting too long
diff -r a1d0846c4504 -r b3cfaf0a505c includes/paths.php
--- a/includes/paths.php Sat Nov 24 13:16:20 2007 -0500
+++ b/includes/paths.php Sun Nov 25 17:53:03 2007 -0500
@@ -677,17 +677,30 @@
/**
* Rebuilds the search index
+ * @param bool $verbose If true, prints out status messages
*/
- function rebuild_search_index()
+ function rebuild_search_index($verbose = false)
{
global $db, $session, $paths, $template, $plugins; // Common objects
$search = new Searcher();
+ if ( $verbose )
+ {
+ echo '
';
// return;
$q = $db->sql_query('DELETE FROM '.table_prefix.'search_index');
diff -r a1d0846c4504 -r b3cfaf0a505c includes/search.php
--- a/includes/search.php Sat Nov 24 13:16:20 2007 -0500
+++ b/includes/search.php Sun Nov 25 17:53:03 2007 -0500
@@ -19,7 +19,7 @@
* @param array $arr2
* @return array
*/
-
+
function enano_safe_array_merge($arr1, $arr2)
{
$arr3 = $arr1;
@@ -34,7 +34,7 @@
* In Enano versions prior to 1.0.2, this class provided a search function that was keyword-based and allowed boolean searches. It was
* cut from Coblynau and replaced with perform_search(), later in this file, because of speed issues. Now mostly deprecated. The only
* thing remaining is the buildIndex function, which is still used by the path manager and the new search framework.
- *
+ *
* @package Enano
* @subpackage Page management frontend
* @license GNU General Public License
@@ -42,17 +42,17 @@
class Searcher
{
-
+
var $results;
var $index;
var $warnings;
var $match_case = false;
-
+
function buildIndex($texts)
{
$this->index = Array();
$stopwords = get_stopwords();
-
+
foreach($texts as $i => $l)
{
$seed = md5(microtime(true) . mt_rand());
@@ -119,15 +119,15 @@
{
global $db, $session, $paths, $template, $plugins; // Common objects
$warnings = array();
-
+
$query = parse_search_query($query, $warnings);
-
+
// Segregate search terms containing spaces
$query_phrase = array(
'any' => array(),
'req' => array()
);
-
+
foreach ( $query['any'] as $i => $_ )
{
$term =& $query['any'][$i];
@@ -141,7 +141,7 @@
}
unset($term);
$query['any'] = array_values($query['any']);
-
+
foreach ( $query['req'] as $i => $_ )
{
$term =& $query['req'][$i];
@@ -154,12 +154,13 @@
}
unset($term);
$query['req'] = array_values($query['req']);
-
+
$results = array();
$scores = array();
-
+ $ns_list = '(' . implode('|', array_keys($paths->nslist)) . ')';
+
// FIXME: Update to use FULLTEXT algo when available.
-
+
// Build an SQL query to load from the index table
if ( count($query['any']) < 1 && count($query['req']) < 1 && count($query_phrase['any']) < 1 && count($query_phrase['req']) < 1 )
{
@@ -167,14 +168,14 @@
$warnings[] = 'You need to have at least one keyword in your search query. Searching only for pages not containing a term is not allowed.';
return array();
}
-
+
//
// STAGE 1
// Get all possible result pages from the search index. Tally which pages have the most words, and later sort them by boolean relevance
//
-
+
// Skip this if no indexable words are included
-
+
if ( count($query['any']) > 0 || count($query['req']) > 0 )
{
$where_any = array();
@@ -192,18 +193,18 @@
$term = strtolower($term);
$where_any[] = $term;
}
-
+
$col_word = ( $case_sensitive ) ? 'word' : 'lcase(word)';
$where_any = ( count($where_any) > 0 ) ? '( ' . $col_word . ' = \'' . implode('\' OR ' . $col_word . ' = \'', $where_any) . '\' )' : '';
-
+
// generate query
// using a GROUP BY here ensures that the same word with a different case isn't counted as 2 words - it's all melted back
// into one later in the processing stages
- $group_by = ( $case_sensitive ) ? '' : ' GROUP BY lcase(word);';
- $sql = "SELECT word, page_names FROM " . table_prefix . "search_index WHERE {$where_any}{$group_by}";
+ // $group_by = ( $case_sensitive ) ? '' : ' GROUP BY lcase(word);';
+ $sql = "SELECT word, page_names FROM " . table_prefix . "search_index WHERE {$where_any}";
if ( !($q = $db->sql_unbuffered_query($sql)) )
$db->_die('Error is in perform_search(), includes/search.php, query 1');
-
+
$word_tracking = array();
if ( $row = $db->fetchrow() )
{
@@ -211,11 +212,10 @@
{
// get page list
$pages =& $row['page_names'];
- $ns_list = '(' . implode('|', array_keys($paths->nslist)) . ')';
if ( strpos($pages, ',') )
{
// the term occurs in more than one page
-
+
// Find page IDs that contain commas
// This should never happen because commas are escaped by sanitize_page_id(). Nevertheless for compatibility with older
// databases, and to alleviate the concerns of hackers, we'll accommodate for page IDs with commas here by checking for
@@ -235,62 +235,104 @@
$prev = $i;
}
unset($match);
-
+
// Iterate through each of the results, assigning scores based on how many times the page has shown up.
// This works because this phase of the search is strongly word-based not page-based. If a page shows up
// multiple times while fetching the result rows from the search_index table, it simply means that page
// contains more than one of the terms the user searched for.
-
+
foreach ( $matches as $match )
{
- if ( isset($scores[$match]) )
+ $word_cs = (( $case_sensitive ) ? $row['word'] : strtolower($row['word']));
+ if ( isset($word_tracking[$match]) && in_array($word_cs, $word_tracking[$match]) )
{
- $scores[$match]++;
+ continue;
+ }
+ if ( isset($word_tracking[$match]) )
+ {
+ if ( isset($word_tracking[$match]) )
+ {
+ $word_tracking[$match][] = ($word_cs);
+ }
}
else
{
- $scores[$match] = 1;
+ $word_tracking[$match] = array($word_cs);
}
- if ( isset($word_tracking[$match]) )
+ $inc = 1;
+
+ // Is this search term present in the page's title? If so, give extra points
+ preg_match("/^ns=$ns_list;pid=(.+)$/", $match, $piecesparts);
+ $pathskey = $paths->nslist[ $piecesparts[1] ] . sanitize_page_id($piecesparts[2]);
+ if ( isset($paths->pages[$pathskey]) )
{
- $word_tracking[$match][] = $row['word'];
+ $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr';
+ if ( $test_func($paths->pages[$pathskey]['name'], $row['word']) || $test_func($paths->pages[$pathskey]['urlname_nons'], $row['word']) )
+ {
+ $inc = 1.5;
+ }
+ }
+ if ( isset($scores[$match]) )
+ {
+ $scores[$match] = $scores[$match] + $inc;
}
else
{
- $word_tracking[$match] = array($row['word']);
+ $scores[$match] = $inc;
}
}
}
else
{
// the term only occurs in one page
- if ( isset($scores[$pages]) )
+ $word_cs = (( $case_sensitive ) ? $row['word'] : strtolower($row['word']));
+ if ( isset($word_tracking[$pages]) && in_array($word_cs, $word_tracking[$pages]) )
{
- $scores[$pages]++;
+ continue;
+ }
+ if ( isset($word_tracking[$pages]) )
+ {
+ if ( isset($word_tracking[$pages]) )
+ {
+ $word_tracking[$pages][] = ($word_cs);
+ }
}
else
{
- $scores[$pages] = 1;
+ $word_tracking[$pages] = array($word_cs);
}
- if ( isset($word_tracking[$pages]) )
+ $inc = 1;
+
+ // Is this search term present in the page's title? If so, give extra points
+ preg_match("/^ns=$ns_list;pid=(.+)$/", $pages, $piecesparts);
+ $pathskey = $paths->nslist[ $piecesparts[1] ] . sanitize_page_id($piecesparts[2]);
+ if ( isset($paths->pages[$pathskey]) )
{
- $word_tracking[$pages][] = $row['word'];
+ $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr';
+ if ( $test_func($paths->pages[$pathskey]['name'], $row['word']) || $test_func($paths->pages[$pathskey]['urlname_nons'], $row['word']) )
+ {
+ $inc = 1.5;
+ }
+ }
+ if ( isset($scores[$pages]) )
+ {
+ $scores[$pages] = $scores[$pages] + $inc;
}
else
{
- $word_tracking[$pages] = array($row['word']);
+ $scores[$pages] = $inc;
}
}
}
while ( $row = $db->fetchrow() );
}
$db->free_result();
-
+
//
// STAGE 2: FIRST ELIMINATION ROUND
// Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it
//
-
+
foreach ( $query['req'] as $term )
{
foreach ( $word_tracking as $i => $page )
@@ -302,85 +344,108 @@
}
}
}
-
+
//
// STAGE 3: PHRASE SEARCHING
// Use LIKE to find pages with specified phrases. We can do a super-picky single query without another elimination round because
// at this stage we can search the full page_text column instead of relying on a word list.
//
-
+
// We can skip this stage if none of these special terms apply
-
+
$text_col = ( $case_sensitive ) ? 'page_text' : 'lcase(page_text)';
-
+ $name_col = ( $case_sensitive ) ? 'name' : 'lcase(name)';
+ $text_col_join = ( $case_sensitive ) ? 't.page_text' : 'lcase(t.page_text)';
+ $name_col_join = ( $case_sensitive ) ? 'p.name' : 'lcase(p.name)';
+
if ( count($query_phrase['any']) > 0 || count($query_phrase['req']) > 0 )
{
-
+
$where_any = array();
foreach ( $query_phrase['any'] as $term )
{
$term = escape_string_like($term);
if ( !$case_sensitive )
$term = strtolower($term);
- $where_any[] = $term;
+ $where_any[] = "( $text_col LIKE '%$term%' OR $name_col LIKE '%$term%' )";
}
-
- $where_any = ( count($where_any) > 0 ) ? "( $text_col LIKE '%" . implode("%' OR $text_col LIKE '%", $where_any) . "%' )" : '';
-
- // Also do required columns, but use AND to ensure that all required terms are included
+
+ $where_any = ( count($where_any) > 0 ) ? implode(" OR\n ", $where_any) : '';
+
+ // Also do required terms, but use AND to ensure that all required terms are included
$where_req = array();
foreach ( $query_phrase['req'] as $term )
{
$term = escape_string_like($term);
if ( !$case_sensitive )
$term = strtolower($term);
- $where_req[] = $term;
+ $where_req[] = "( $text_col LIKE '%$term%' OR $name_col LIKE '%$term%' )";
}
$and_clause = ( $where_any != '' ) ? 'AND ' : '';
- $where_req = ( count($where_req) > 0 ) ? "{$and_clause}$text_col LIKE '%" . implode("%' AND $text_col LIKE '%", $where_req) . "%'" : '';
-
- $sql = 'SELECT CONCAT("ns=",namespace,";pid=",page_id) AS id FROM ' . table_prefix . "page_text WHERE $where_any $where_req;";
+ $where_req = ( count($where_req) > 0 ) ? "{$and_clause}" . implode(" AND\n ", $where_req) : '';
+
+ $sql = 'SELECT CONCAT("ns=",t.namespace,";pid=",t.page_id) AS id, p.name FROM ' . table_prefix . "page_text AS t\n"
+ . " LEFT JOIN " . table_prefix . "pages AS p\n"
+ . " ON ( p.urlname = t.page_id AND p.namespace = t.namespace )\n"
+ . " WHERE\n $where_any\n $where_req;";
if ( !($q = $db->sql_unbuffered_query($sql)) )
$db->_die('Error is in perform_search(), includes/search.php, query 2. Parsed query dump follows:
Index was not rebuilt due to an error.';
@@ -94,7 +95,7 @@
$qin = ( isset($q) ) ? str_replace('"', '\"', htmlspecialchars($q)) : '';
$search_form = '
';
@@ -133,10 +134,10 @@
foreach ( $results as $i => $_ )
{
$result =& $results[$i];
- $result['page_text'] = str_replace(array('', ''), array('', ''), $result['page_text']);
+ $result['page_text'] = str_replace(array('', ''), array('', ''), $result['page_text']);
if ( !empty($result['page_text']) )
$result['page_text'] .= ' ';
- $result['page_name'] = str_replace(array('', ''), array('', ''), $result['page_name']);
+ $result['page_name'] = str_replace(array('', ''), array('', ''), $result['page_name']);
if ( $result['page_length'] >= 1048576 )
{
$result['page_length'] = round($result['page_length'] / 1048576, 1);
diff -r a1d0846c4504 -r b3cfaf0a505c plugins/SpecialUpdownload.php
--- a/plugins/SpecialUpdownload.php Sat Nov 24 13:16:20 2007 -0500
+++ b/plugins/SpecialUpdownload.php Sun Nov 25 17:53:03 2007 -0500
@@ -218,12 +218,25 @@
global $do_gzip;
$filename = rawurldecode($paths->getParam(0));
$timeid = $paths->getParam(1);
- if($timeid && preg_match('#^([0-9]+)$#', (string)$timeid)) $tid = ' AND time_id='.$timeid;
- else $tid = '';
+ if ( $timeid && preg_match('#^([0-9]+)$#', (string)$timeid) )
+ {
+ $tid = ' AND time_id='.$timeid;
+ }
+ else
+ {
+ $tid = '';
+ }
$filename = $db->escape($filename);
$q = $db->sql_query('SELECT page_id,size,mimetype,time_id,file_extension,file_key FROM '.table_prefix.'files WHERE filename=\''.$filename.'\''.$tid.' ORDER BY time_id DESC;');
- if(!$q) $db->_die('The file data could not be selected.');
- if($db->numrows() < 1) { header('HTTP/1.1 404 Not Found'); die_friendly('File not found', '
The file "'.$filename.'" cannot be found.
'); }
+ if ( !$q )
+ {
+ $db->_die('The file data could not be selected.');
+ }
+ if ( $db->numrows() < 1 )
+ {
+ header('HTTP/1.1 404 Not Found');
+ die_friendly('File not found', '
The file "'.$filename.'" cannot be found.
');
+ }
$row = $db->fetchrow();
$db->free_result();
diff -r a1d0846c4504 -r b3cfaf0a505c plugins/SpecialUserFuncs.php
--- a/plugins/SpecialUserFuncs.php Sat Nov 24 13:16:20 2007 -0500
+++ b/plugins/SpecialUserFuncs.php Sun Nov 25 17:53:03 2007 -0500
@@ -198,13 +198,19 @@
Password:
-
+
Important note regarding cryptography: Some countries do not allow the import or use of cryptographic technology. If you live in one of the countries listed below, you should log in without using encryption.
This restriction applies to the following countries: Belarus, China, India, Israel, Kazakhstan, Mongolia, Pakistan, Russia, Saudi Arabia, Singapore, Tunisia, Venezuela, and Vietnam.
+
+
+
+
Encrypted logon has been disabled. Unless you live in a country where encryption technology is illegal, you should use encryption when you log on to help protect against password sniffing.
+
+
diff -r a1d0846c4504 -r b3cfaf0a505c upgrade.sql
--- a/upgrade.sql Sat Nov 24 13:16:20 2007 -0500
+++ b/upgrade.sql Sun Nov 25 17:53:03 2007 -0500
@@ -10,13 +10,13 @@
-- I have no idea how or why, but the f'ing index didn't get created for who-knows-how-many releases.
-- We'll attempt to create it here, but don't die if it fails
@ALTER TABLE {{TABLE_PREFIX}}page_text ENGINE = MYISAM, COLLATE = utf8_bin, CHARSET = utf8;
-@CREATE FULLTEXT INDEX {{TABLE_PREFIX}}page_search_idx ON {{TABLE_PREFIX}}page_text(page_id, namespace, page_text);
ALTER TABLE {{TABLE_PREFIX}}search_index CHARSET = utf8, COLLATE = utf8_bin, MODIFY COLUMN word varchar(64) NOT NULL;
-- The search cache is no longer needed because of the new unified search engine
@DROP TABLE {{TABLE_PREFIX}}search_cache;
-- Yes, it appears we need pages with names this long after all
ALTER TABLE {{TABLE_PREFIX}}pages MODIFY COLUMN urlname varchar(255), MODIFY COLUMN name varchar(255);
-ALTER TABLE {{TABLE_PREFIX}}page_text MODIFY COLUMN page_id varchar(255);
+ALTER TABLE {{TABLE_PREFIX}}page_text MODIFY COLUMN page_id varchar(255), MODIFY COLUMN namespace varchar(63), MODIFY COLUMN page_text longtext;
+@CREATE FULLTEXT INDEX {{TABLE_PREFIX}}page_search_idx ON {{TABLE_PREFIX}}page_text(page_id, namespace, page_text);
---END 1.0.2b1---
---BEGIN 1.0.1.1---
---END 1.0.1.1---