#!perl use CGI; use CGI::Carp qw(fatalsToBrowser); use CGI::Cookie; use LSE::Config; use LSE::JsDump; use LSE::Database::Connection; use LSE::Database::Services; use URI::Escape; use Encode qw(decode_utf8 encode_utf8); use MIME::Base64; use HTML::Entities; use strict; use warnings; use utf8; =head1 NAME query.pl - Unified query by example and search page =head1 SUMMARY Each action of the script is detailed in its own section: PARSE EXAMPLE SENTENCE, SAVE QUERY, DELETE QUERY, PERFORM SEARCH, NAVIGATE RESULTS. =cut # Container for the data to convert to JavaScript and return to the # browser. our ($q, $obj, $config, $usingChinese, $searchColumn); local ($q, $obj, $config, $usingChinese, $searchColumn); $obj = {}; $config = getGenConfig(); $q = new CGI; # Add info to object $obj->{user} = $ENV{REMOTE_USER}; =head1 PARSE EXAMPLE SENTENCE Uses the parser SOAP server to parse an example sentence. This action is the default if the C parameter is unset. In that case, the sentence "Enter an example sentence" is used. If the "rawqueryText" parameter is set and the action is not "Parse," that is used as the query instead of the parse of "Enter an example sentence." This comes first so that a tree and example sentence are set even if we abort later on. =head3 CGI parameters C - The sentence to parse. C - Example sentence to display but not to parse C - Tree to display in the tree editor. C =cut $q->param('action','Parse') if !$q->param('action') and !$q->param('rawqueryText') and !$q->param('navaction'); my %cookies = fetch CGI::Cookie; my @queryprefs; @queryprefs = $cookies{queryprefs}->value if(exists $cookies{queryprefs}); # Check if we are using Chinese $usingChinese = 0; $usingChinese = 1 if( ($q->param('language') and $q->param('language') eq 'Chinese') or grep {$_ eq 'language:1'} @queryprefs); $searchColumn = 'charniak'; $searchColumn = 'zh_parse' if $usingChinese; my $rawquery = $q->param('rawqueryText'); $rawquery = "" unless defined $rawquery; # If we're coming from the My Collection page.. if($q->param('action') and $q->param('action') eq 'SetSource') { $obj->{sourceOverride} = $q->param('source'); $obj->{tabOverride} = 'searchOptions'; # parse example sentence $q->param('action','Parse'); } # Do query by example if($q->param('action') and $q->param('action') eq 'Parse') { # If we're using Chinese parse a Chinese example sentence if($usingChinese) { $q->param('eg_string',encode_utf8('第八 届 安第斯 议会 会议 十三日 在 利马 开幕')) if(!$q->param('eg_string')); my $unicode_eg = decode_utf8($q->param('eg_string')); $obj->{tree} = decode_utf8(parseSentenceChinese($q->param('eg_string'))); $obj->{eg_string} = $unicode_eg; } else { # Otherwise just use English $q->param('eg_string','Enter an example sentence here.') if(!$q->param('eg_string')); $obj->{tree} = parseSentence($q->param('eg_string')); $obj->{eg_string} = $q->param('eg_string'); } $obj->{expanderOverrides} = { exampleSentence => 1, treeEditor => 1, rawQuery => 1 }; $obj->{tabOverride} = "hidden" if !$obj->{tabOverride}; } elsif($q->param('navaction')) { # Ignore a newly-entered query if we are just navigating the results $obj->{tree} = decode_utf8($q->param('navquery')); $obj->{sourceOverride} = $q->param('navsource'); $q->param('rawqueryText',decode_utf8($q->param('navquery'))); $q->param('source',$q->param('navsource')); if($q->param('eg_string')) { $obj->{eg_string} = decode_utf8($q->param('eg_string')); } else { $obj->{eg_string} = ""; } } else { $obj->{tree} = decode_utf8($q->param('rawqueryText')); if($q->param('eg_string')) { $obj->{eg_string} = decode_utf8($q->param('eg_string')); } else { $obj->{eg_string} = ""; } $obj->{expanderOverrides} = { exampleSentence => 1, treeEditor => 1, rawQuery => 1 }; $obj->{tabOverride} = "hidden" if !$obj->{tabOverride}; } # Set up parameters for tree editor applet getAppletInfo($obj,$obj->{tree},$obj->{eg_string}); =head1 DELETE QUERY CGI parameters C - The ID of the query to delete C =cut if($q->param('action') and $q->param('action') eq 'Delete Query') { if (!$q->param('qid')) { getSavedQueries($obj); queryMessage("You must provide a query ID to delete"); } my $qid = $q->param('qid'); eval { my $dbh = getConnection(); $dbh->begin_work; my $s_qid = $dbh->quote($qid); $dbh->do("DELETE FROM saved_queries WHERE qid = $s_qid"); $dbh->commit; getSavedQueries($obj); queryMessage("Query deleted."); }; getSavedQueries($obj); queryMessage("Error deleting query: $@") if ($@); } =head1 SAVE QUERY Saves a query and the associated source. May eventually remove the source from the saved query info but for now it's required by the database schema. After saving the query, load the user's saved queries including the newly saved one. =head3 CGI parameters C - Short name to display for the saved query. A new saved query will replace an older saved query with the same name. C - A longer description of the saved query C - The raw query itself C =cut if($q->param('action') and $q->param('action') eq 'Save Query') { my $qname = decode_utf8($q->param('savequeryname')); my $qdesc = decode_utf8($q->param('savequerydesc')); my $qquery = decode_utf8($rawquery); my $quid = $ENV{REMOTE_USER}; getSavedQueries($obj) if !$qname or !$qquery or !$quid; queryMessage("You must provide a name for your saved query") if !$qname; queryMessage("You must provide a query to save") if !$qquery; queryMessage("Missing userid") if !$quid; $qdesc = "" if !$qdesc; eval { my $dbh = getConnection(); $dbh->begin_work; my $s_qname = $dbh->quote($qname); my $s_qdesc = $dbh->quote($qdesc); my $s_qquery = $dbh->quote($qquery); my $s_quid = $dbh->quote($quid); $dbh->do("DELETE FROM saved_queries WHERE name = $s_qname and userid = $s_quid"); $dbh->do("INSERT INTO saved_queries (userid, description, query, name) VALUES ($s_quid, $s_qdesc, $s_qquery, $s_qname)"); $dbh->commit; # Load the saved queries getSavedQueries($obj); $obj->{tabOverride} = "loadQuery"; queryMessage("Query saved."); }; # Even if saving the query goes wrong, we need to present the user # with their old saved queries. my $sqerr = $@; getSavedQueries($obj); queryMessage("Error saving query: $sqerr") if ($sqerr); } else { # No query to save, just load the saved queries. getSavedQueries($obj); } =head1 NAVIGATE RESULTS Set the offset/limit appropriately for the given search options and navigation link =head3 CGI parameters C - Number of results to display on one page. C - Number of results displayed on previous page C - Number of results to skip =cut my $offset = 0; my $limit = 20; my $oldlimit = 20; # Nav code from old query.pl: $oldlimit = $q->param('oldlimit') if(defined $q->param('oldlimit')); $limit = $q->param('numresults') if(defined $q->param('numresults')); $offset = $q->param('offset') if(defined $q->param('offset')); # Set expander override and default tab for first page of search results if(defined $q->param('action') and $q->param('action') eq 'Search') { $obj->{expanderOverrides} = { exampleSentence => 0, treeEditor => 0, rawQuery => 0 }; $obj->{tabOverride} = "searchOptions"; # Save the source for the query $obj->{sourceOverride} = $q->param('source'); } if(defined $q->param('navaction') and $q->param('navaction') eq 'next') { # Start displaying at the old offset plus the number of results we # displayed on the previous page $offset += $oldlimit; $q->param('action','Search'); } if(defined $q->param('navaction') and $q->param('navaction') eq 'prev') { $offset -= $limit; $q->param('action','Search'); } my $download = 0; if(defined $q->param('action') and $q->param('action') eq 'Download Results') { $offset = 0; $limit = 0; $q->param('action','Search'); $download = 1; } $offset = 0 if $offset < 0; $oldlimit = $limit; $obj->{navigate} = {offset => $offset, limit => $limit, oldlimit => $oldlimit}; =head1 PERFORM SEARCH Send a query to QueryMistress and retrieve the results. =head3 CGI parameters C - The source to use for the search. Can contain embedded source parameters. C - The textual representation of the query. C =cut if($q->param('action') and $q->param('action') eq 'Search') { my $query = decode_utf8($rawquery); my $source = $q->param('source'); queryMessage("No source selected") if !$source; queryMessage("No query provided") if !$query; doSearch($obj,$query,$source,$limit,$offset); } # Make the html page with the data object if($download) { if($q->param('restype') eq 'normal') { print $q->header('application/x-csv'); downloadResults(); return; } if($q->param('restype') eq 'kwic') { print $q->header('application/x-csv'); downloadKwicResults(); } } else { print $q->header('text/html; charset=UTF-8'); make_html_page($obj,"query.html"); } =head1 METHODS =head2 queryMessage($message) Displays the passed message and aborts. =cut sub queryMessage { my $message = shift; # add error message to output object $obj->{message} = $message; print $q->header('text/html; charset=UTF-8'); make_html_page($obj,"query.html"); exit; #die($message); } =head2 parseSentenceChinese($sentence) Parses the sentence using the Chinese query-by-example SOAP service (uses Stanford Lexicalized Parser) =cut sub parseSentenceChinese { my $sentence = shift; my $qbe_svc = getService("http://lse.umiacs.umd.edu/LSE/Annot/ChineseParse"); my $result = $qbe_svc->do_sentence($sentence); if($result->fault) { # retry /restart if appropriate queryMessage("Error parsing sentence:
" . join(', ', $result->faultcode, $result->faultstring) . "
"); } return $result->result; } =head2 parseSentence($sentence) Parses the sentence using the query-by-example SOAP service (uses Charniak's parser) =cut sub parseSentence { my $sentence = shift; my $qbe_svc = getService("http://lse.umiacs.umd.edu/LSE/Annot/Charniak"); my $result = $qbe_svc->do_sentence($sentence); if($result->fault) { # retry /restart if appropriate queryMessage("Error parsing sentence:
" . join(', ', $result->faultcode, $result->faultstring) . "
"); } return $result->result; } =head2 getSavedQueries($obj) Add the saved query and collection information to the data object to send to the browser. =cut sub getSavedQueries { my $obj = shift; eval { my @queries = (); my $dbh = getConnection(); $dbh->begin_work; $dbh->do("SET LOCAL datestyle TO postgres"); my $s_userid = $dbh->quote($ENV{REMOTE_USER}); my $sth = $dbh->prepare("SELECT *, date_trunc('second',ctime) as jstimestamp FROM saved_queries WHERE userid = $s_userid"); $sth->execute; while(my $row = $sth->fetchrow_hashref) { push(@queries,{qid => $row->{qid}, userid => $row->{userid}, timestamp => $row->{jstimestamp}, description => decode_utf8($row->{description}), name => decode_utf8($row->{name}), query => decode_utf8($row->{query})}); } $obj->{queries} = [@queries]; }; push(@{$obj->{warnings}},"Error getting saved queries: $@") if $@; $obj->{sources} = [ ]; eval { my $dbh = getConnection(); my $s_user = $dbh->quote($ENV{REMOTE_USER}); my $collections = $dbh->selectall_arrayref("select userid, name, sum(scount) as scount, sum(dcount) as dcount from col_queries cq natural join collection_sentence_acount where (userid = $s_user or userid = 'public') and count > 0 and type = '$searchColumn' group by userid,name order by userid, name"); foreach my $col (@$collections) { push(@{$obj->{sources}}, {source => 'userid=' . uri_escape($col->[0]) . '&' . 'name=' . uri_escape($col->[1]), caption => ($col->[0] eq 'public' ? 'Public Collection: ' : 'My Collection: ') . $col->[1] . ", $col->[2] sentences in $col->[3] documents", # default collection - lseweb bydefault => ($col->[1] eq 'lseweb')}); } }; push(@{$obj->{warnings}},"Error getting collections: $@") if $@; } =head2 getAppletInfo($obj,$tree,$example_string) Add the applet parameters to the data object to send to the browser =cut sub getAppletInfo { my $obj = shift; my $tree = shift; my $example_string = shift; our $config; $obj->{applet} = { archive => $config->{qbe}{applet_jarfile}, code => $config->{qbe}{applet_code}, className => $config->{qbe}{applet_class}, name => $config->{qbe}{applet_name}, height => $config->{qbe}{applet_height}, width => $config->{qbe}{applet_width} }; $obj->{applet_params} = { URLExpand => $config->{qbe}{applet_expand_param}, URLWordNet => $config->{qbe}{applet_wordnet_param}, tree => uri_escape(encode_utf8($tree)) }; } =head2 doSearch($obj,$query,$limit,$offset) Perform the given query on the given source, returning results offset through offset + limit and attaching the results to the given data object. Marshals the simple query/source parameter into the more complicated representation used by QueryMistress. =cut sub doSearch { my $obj = shift; my $query = shift; my $source = shift; my $limit = shift; my $offset = shift; my $qm_svc = getService("http://lse.umiacs.umd.edu/LSE/Query/Mistress"); my $qm_query = {id => 'constituency', searchstring => encode_utf8($query)}; # restrict search by level if that is desired my $level = $q->param('level'); if($level and $level ne 'Any') { $qm_query->{level} = $level; } my $sourceparams = {id => 'source'}; my @sourcebits = split('&',$source); # decode the source parameters. They're in a cgi-ish format. foreach my $sbit (@sourcebits) { my ($key,$val) = split('=',$sbit); $sourceparams->{uri_unescape($key)} = uri_unescape($val); } # Default annotation type to query $sourceparams->{atype} = $searchColumn; my $result = $qm_svc->performQueries({source => $sourceparams, query => $qm_query, limit => $limit, offset => $offset}); if($result->fault) { # retry /restart if appropriate queryMessage("Error performing query:
" . join(', ', $result->faultcode, $result->faultstring) . "
"); } # FIXME: Reasons why query might return no results queryMessage("Query returned no results") if not defined $result->result; $obj->{result} = $result->result; use Data::Dumper; # queryMessage("No more results ($offset)". "
". Dumper($source,$sourceparams,$qm_query,$limit,$offset,$result->result) . "
") if $offset > $obj->{result}{num_results}; queryMessage("No more results") if $offset > $obj->{result}{num_results}; # manually decode warnings & errors from UTF-8 - need to do this because SOAP # gets confused when presented with unicode strings, which is not really very smart.. for(my $erridx = 0; $erridx < @{$obj->{result}{warnings}}; $erridx++) { $obj->{result}{warnings}[$erridx] = decode_utf8($obj->{result}{warnings}[$erridx]); } for(my $erridx = 0; $erridx < @{$obj->{result}{errors}}; $erridx++) { $obj->{result}{errors}[$erridx] = decode_utf8($obj->{result}{errors}[$erridx]); } my $dbh = getConnection(); my @rbody = (); foreach my $sentence (@{$obj->{result}{results}}) { eval { my $docid = $sentence->[0]; my $seqid = $sentence->[1]; my $sinfo = { docid => $docid, seqid => $seqid}; my $sth = $dbh->prepare("select type,data from document_annots where docid = '$docid'"); $sth->execute(); while(my $row = $sth->fetchrow_arrayref) { $sinfo->{$row->[0]} = decode_utf8($row->[1]); } $sth = $dbh->prepare("select type,data from annotations where docid = '$docid' and seqid = '$seqid'"); $sth->execute; while(my $row = $sth->fetchrow_arrayref) { $sinfo->{$row->[0]} = decode_utf8($row->[1]); } $sinfo->{archived} = "/lse/ia.pl?docid=" . $sinfo->{docid} . "&seqid=". $sinfo->{seqid}; $sinfo->{archived} .= "&lang=zh" if($usingChinese); $sinfo->{annot} = "/lse/display_sentence.pl?docid=" . $sinfo->{docid} . "&seqid=". $sinfo->{seqid}; $sinfo->{annot} .= "&lang=zh" if($usingChinese); $sinfo->{filtered} = filterSentence($sinfo->{uri},$sinfo->{body}); $sinfo->{best_level} = formatBestLevel($sinfo->{leveler}) if(exists $sinfo->{leveler}); delete $sinfo->{best_level} unless defined $sinfo->{best_level}; push(@rbody,$sinfo); }; push(@{$obj->{warnings}},$@) if($@); } $obj->{result}{results} = [@rbody]; } =head2 filterSentence($uri,$sentence) Checks URI and sentence body against filter keywords; returns true (1) if the URI contains any keyword as a substring or the sentence body contains one of the keywords separated from other stuff by perl re \b or \W on both sides. Returns false (0) if there is no match. =cut sub filterSentence { my $uri = shift; my $sentence = shift; my @words = getOffensiveWords(); foreach my $item (@words){ if($uri =~ /$item/i or $sentence =~ /(\b|\W)$item(\b|\W)/i) { return 1; } } return 0; } =head1 AUTHOR Aaron Elkiss =cut sub downloadResults { print qq("url","internet_archive_url","sentence"\n); my $filter = $q->param('filter'); foreach my $sentence (@{$obj->{result}{results}}) { printCsvRecord($sentence,$filter); } exit; } sub downloadKwicResults { my $word = $q->param('kwickeyword'); $word = "" if(not defined $word); my $filter = $q->param('filter'); print qq("url","internet_archive_url","pre","word","post"\n); foreach my $sentence (@{$obj->{result}{results}}) { printKwicRecord($sentence,$word,$filter) } } sub printCsvRecord { my $sentence = shift; my $filter = shift; my $field = "body"; $field = "zh_body" if $usingChinese; return if $filter and $sentence->{filtered}; my $iaurl = makeIaUrl($sentence->{uri},$sentence->{timestamp}); $sentence->{uri} =~ s/\"/\"\"/g; $sentence->{$field} =~ s/\"/\"\"/g; $sentence->{$field} =~ s/\n//gm; print '"', join('","',$sentence->{uri}, $iaurl, $sentence->{$field}), '"', "\n"; } sub printKwicRecord { my $sentence = shift; my $keyword = decode_utf8(shift); my $filter = shift; our $config; my $field = "body"; $field = "zh_body" if $usingChinese; return if $filter and $sentence->{filtered}; my $iaurl = makeIaUrl($sentence->{uri},$sentence->{timestamp}); $sentence->{uri} =~ s/\"/\"\"/g; $sentence->{$field} =~ s/\"/\"\"/g; $sentence->{$field} =~ s/\n//gm; my($pre,$word,$post) = $sentence->{$field} =~ /^(.*)($keyword)(.*)/i; return if not defined $word; $pre = "" if not defined $pre; $post = "" if not defined $post; print '"', join('","',$sentence->{uri}, $iaurl, $pre, $word, $post), '"', "\n"; } sub makeIaUrl { my ($url, $timestamp) = @_; return "" if not defined $url; $timestamp = "*" if not defined $timestamp or !$timestamp; return "http://web.archive.org/web/$timestamp/$url"; } sub formatBestLevel { my ($level) = @_; my @levels = ($level =~ /^L2:(\S+) L3:(\S+) L4:(\S+)/) or return undef; my $maxlevel = 0; for (my $i = 1; $i < @levels; $i++) { if($levels[$i] > $levels[$maxlevel]) { $maxlevel = $i; } } return sprintf('Level %d: %2.2f%% confident',$maxlevel + 2,$levels[$maxlevel] * 100); }