* [RFC 0/3] search: more mairix compatibility changes
@ 2016-09-07 23:57 Eric Wong
2016-09-07 23:57 ` [PATCH 1/3] search: allow searching user fields (To/Cc/From) Eric Wong
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Eric Wong @ 2016-09-07 23:57 UTC (permalink / raw)
To: spew
1/3 and 2/3 should not be problematic, but 3/3 seems to be...
Eric Wong (3):
search: allow searching user fields (To/Cc/From)
search: drop longer subject: prefix for search
search: more granular message body searching
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH 1/3] search: allow searching user fields (To/Cc/From)
2016-09-07 23:57 [RFC 0/3] search: more mairix compatibility changes Eric Wong
@ 2016-09-07 23:57 ` Eric Wong
2016-09-07 23:57 ` [PATCH 2/3] search: drop longer subject: prefix for search Eric Wong
2016-09-07 23:57 ` [PATCH 3/3] search: more granular message body searching Eric Wong
2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2016-09-07 23:57 UTC (permalink / raw)
To: spew
Sometimes it can be useful to search based on who the
message was sent to, sent by, or Cc:-ed. Of course,
headers can be faked, but they usually are not...
Anyway, this mostly matches the behavior of mairix(1).
---
lib/PublicInbox/Search.pm | 10 +++++++-
lib/PublicInbox/SearchIdx.pm | 59 +++++++++++++++++++++++++++++++-------------
t/search.t | 37 +++++++++++++++++++++++++++
3 files changed, 88 insertions(+), 18 deletions(-)
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 445c2d8..aec459b 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -51,8 +51,8 @@ my %bool_pfx_internal = (
thread => 'G', # newsGroup (or similar entity - e.g. a web forum name)
);
-# do we still need these? probably not..
my %bool_pfx_external = (
+ # do we still need these? probably not..
path => 'XPATH',
mid => 'Q', # uniQue id (Message-ID)
);
@@ -61,6 +61,14 @@ my %prob_prefix = (
subject => 'S',
s => 'S', # for mairix compatibility
m => 'Q', # 'mid' is exact, 'm' can do partial
+ f => 'A', # for mairix compatibility
+ t => 'XTO', # for mairix compatibility
+ tc => 'XTC', # for mairix compatibility
+ c => 'XCC', # for mairix compatibility
+ tcf => 'XTCF', # for mairix compatibility
+ # n.b.: leaving out "a:" alias for "tcf:" even though
+ # mairix supports it. It is only mentioned in passing in mairix(1)
+ # and the extra two letters are not significantly longer.
);
# not documenting m: and mid: for now, the using the URLs works w/o Xapian
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index f54f5f2..37fefbe 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -96,12 +96,51 @@ sub _lock_release {
close $lockfh or die "close failed: $!\n";
}
-sub add_val {
+sub add_val ($$$) {
my ($doc, $col, $num) = @_;
$num = Search::Xapian::sortable_serialise($num);
$doc->add_value($col, $num);
}
+sub add_values ($$$) {
+ my ($smsg, $bytes, $num) = @_;
+
+ my $ts = $smsg->ts;
+ my $doc = $smsg->{doc};
+ add_val($doc, &PublicInbox::Search::TS, $ts);
+
+ defined($num) and add_val($doc, &PublicInbox::Search::NUM, $num);
+
+ defined($bytes) and add_val($doc, &PublicInbox::Search::BYTES, $bytes);
+
+ add_val($doc, &PublicInbox::Search::LINES,
+ $smsg->{mime}->body_raw =~ tr!\n!\n!);
+
+ my $yyyymmdd = strftime('%Y%m%d', gmtime($ts));
+ $doc->add_value(&PublicInbox::Search::YYYYMMDD, $yyyymmdd);
+}
+
+sub index_users ($$) {
+ my ($tg, $smsg) = @_;
+
+ my $from = $smsg->from;
+ my $to = $smsg->to;
+ my $cc = $smsg->cc;
+
+ $tg->index_text($from, 1, 'A'); # A - author
+ $tg->increase_termpos;
+
+ $tg->index_text($to, 1, 'XTO') if $to ne '';
+ $tg->index_text($cc, 1, 'XCC') if $cc ne '';
+ my $tc = join("\t", $to, $cc);
+ $tg->index_text($tc, 1, 'XTC') if $tc ne '';
+ my $tcf = join("\t", $tc, $from);
+ $tg->index_text($tcf, 1, 'XTCF') if $tcf ne '';
+
+ $tg->index_text($from);
+ $tg->increase_termpos;
+}
+
sub add_message {
my ($self, $mime, $bytes, $num, $blob) = @_; # mime = Email::MIME object
my $db = $self->{xdb};
@@ -129,20 +168,7 @@ sub add_message {
$doc->add_term(xpfx('path') . id_compress($path));
}
- my $ts = $smsg->ts;
- add_val($doc, &PublicInbox::Search::TS, $ts);
-
- defined($num) and
- add_val($doc, &PublicInbox::Search::NUM, $num);
-
- defined($bytes) and
- add_val($doc, &PublicInbox::Search::BYTES, $bytes);
-
- add_val($doc, &PublicInbox::Search::LINES,
- $mime->body_raw =~ tr!\n!\n!);
-
- my $yyyymmdd = strftime('%Y%m%d', gmtime($ts));
- $doc->add_value(&PublicInbox::Search::YYYYMMDD, $yyyymmdd);
+ add_values($smsg, $bytes, $num);
my $tg = $self->term_generator;
@@ -152,8 +178,7 @@ sub add_message {
$tg->index_text($subj) if $subj;
$tg->increase_termpos;
- $tg->index_text($smsg->from);
- $tg->increase_termpos;
+ index_users($tg, $smsg);
msg_iter($mime, sub {
my ($part, $depth, @idx) = @{$_[0]};
diff --git a/t/search.t b/t/search.t
index db94c0a..bb0861a 100644
--- a/t/search.t
+++ b/t/search.t
@@ -86,6 +86,7 @@ my $rw_commit = sub {
'Message-ID' => '<last@s>',
From => 'John Smith <js@example.com>',
To => 'list@example.com',
+ Cc => 'foo@example.com',
],
body => "goodbye forever :<\n");
@@ -324,6 +325,42 @@ sub filter_mids {
is(scalar @{$res->{msgs}}, 0, 'nothing before 19931001');
}
+# names and addresses
+{
+ my $res = $ro->query('t:list@example.com');
+ is(scalar @{$res->{msgs}}, 6, 'searched To: successfully');
+ foreach my $smsg (@{$res->{msgs}}) {
+ like($smsg->to, qr/\blist\@example\.com\b/, 'to appears');
+ }
+
+ $res = $ro->query('tc:list@example.com');
+ is(scalar @{$res->{msgs}}, 6, 'searched To+Cc: successfully');
+ foreach my $smsg (@{$res->{msgs}}) {
+ my $tocc = join("\n", $smsg->to, $smsg->cc);
+ like($tocc, qr/\blist\@example\.com\b/, 'tocc appears');
+ }
+
+ foreach my $pfx ('tcf:', 'c:') {
+ $res = $ro->query($pfx . 'foo@example.com');
+ is(scalar @{$res->{msgs}}, 1,
+ "searched $pfx successfully for Cc:");
+ foreach my $smsg (@{$res->{msgs}}) {
+ like($smsg->cc, qr/\bfoo\@example\.com\b/,
+ 'cc appears');
+ }
+ }
+
+ foreach my $pfx ('', 'tcf:', 'f:') {
+ $res = $ro->query($pfx . 'Laggy');
+ is(scalar @{$res->{msgs}}, 1,
+ "searched $pfx successfully for From:");
+ foreach my $smsg (@{$res->{msgs}}) {
+ like($smsg->from, qr/Laggy Sender/,
+ "From appears with $pfx");
+ }
+ }
+}
+
done_testing();
1;
--
EW
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 2/3] search: drop longer subject: prefix for search
2016-09-07 23:57 [RFC 0/3] search: more mairix compatibility changes Eric Wong
2016-09-07 23:57 ` [PATCH 1/3] search: allow searching user fields (To/Cc/From) Eric Wong
@ 2016-09-07 23:57 ` Eric Wong
2016-09-07 23:57 ` [PATCH 3/3] search: more granular message body searching Eric Wong
2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2016-09-07 23:57 UTC (permalink / raw)
To: spew
We only document the "s:" prefix anyway. While the long name is more
descriptive, the ambiguity makes agnostic caching (by Varnish or
similar) slightly harder and longer URLs are more likely to be
accidentally truncated when shared.
---
lib/PublicInbox/Search.pm | 1 -
t/search.t | 14 +++++++-------
2 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index aec459b..3b25b66 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -58,7 +58,6 @@ my %bool_pfx_external = (
);
my %prob_prefix = (
- subject => 'S',
s => 'S', # for mairix compatibility
m => 'Q', # 'mid' is exact, 'm' can do partial
f => 'A', # for mairix compatibility
diff --git a/t/search.t b/t/search.t
index bb0861a..7abaf83 100644
--- a/t/search.t
+++ b/t/search.t
@@ -123,19 +123,19 @@ sub filter_mids {
is($res->{total}, 0, "path variant `$p' does not match");
}
- $res = $ro->query('subject:(Hello world)');
+ $res = $ro->query('s:(Hello world)');
@res = filter_mids($res);
- is_deeply(\@res, \@exp, 'got expected results for subject:() match');
+ is_deeply(\@res, \@exp, 'got expected results for s:() match');
- $res = $ro->query('subject:"Hello world"');
+ $res = $ro->query('s:"Hello world"');
@res = filter_mids($res);
- is_deeply(\@res, \@exp, 'got expected results for subject:"" match');
+ is_deeply(\@res, \@exp, 'got expected results for s:"" match');
- $res = $ro->query('subject:"Hello world"', {limit => 1});
+ $res = $ro->query('s:"Hello world"', {limit => 1});
is(scalar @{$res->{msgs}}, 1, "limit works");
my $first = $res->{msgs}->[0];
- $res = $ro->query('subject:"Hello world"', {offset => 1});
+ $res = $ro->query('s:"Hello world"', {offset => 1});
is(scalar @{$res->{msgs}}, 1, "offset works");
my $second = $res->{msgs}->[0];
@@ -181,7 +181,7 @@ sub filter_mids {
$rw_commit->();
$ro->reopen;
- # Subject:
+ # subject
my $res = $ro->query('ghost');
my @exp = sort qw(ghost-message@s ghost-reply@s);
my @res = filter_mids($res);
--
EW
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 3/3] search: more granular message body searching
2016-09-07 23:57 [RFC 0/3] search: more mairix compatibility changes Eric Wong
2016-09-07 23:57 ` [PATCH 1/3] search: allow searching user fields (To/Cc/From) Eric Wong
2016-09-07 23:57 ` [PATCH 2/3] search: drop longer subject: prefix for search Eric Wong
@ 2016-09-07 23:57 ` Eric Wong
2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2016-09-07 23:57 UTC (permalink / raw)
To: spew
"bs:" and "b:" are adapted from mairix(1).
We will also support searching explicitly for quoted vs
non-quoted text via "q:" and "nq:" prefixes since sometimes
readers will not care for quoted text.
In the future, we will support parsing diffs (perhaps when
repobrowse integration is complete).
Note: this roughly doubles the size of the Xapian database due
to the additional information; so this change may not be worth
it.
---
lib/PublicInbox/Search.pm | 18 ++++++++++++------
lib/PublicInbox/SearchIdx.pm | 17 ++++++++++++++---
t/search.t | 25 +++++++++++++++++++++++++
3 files changed, 51 insertions(+), 9 deletions(-)
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 3b25b66..f74129d 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -58,16 +58,22 @@ my %bool_pfx_external = (
);
my %prob_prefix = (
- s => 'S', # for mairix compatibility
+ # for mairix compatibility
+ s => 'S',
m => 'Q', # 'mid' is exact, 'm' can do partial
- f => 'A', # for mairix compatibility
- t => 'XTO', # for mairix compatibility
- tc => 'XTC', # for mairix compatibility
- c => 'XCC', # for mairix compatibility
- tcf => 'XTCF', # for mairix compatibility
+ f => 'A',
+ t => 'XTO',
+ tc => 'XTC',
+ c => 'XCC',
+ tcf => 'XTCF',
+ b => 'XBODY',
+ bs => 'XBS',
+
# n.b.: leaving out "a:" alias for "tcf:" even though
# mairix supports it. It is only mentioned in passing in mairix(1)
# and the extra two letters are not significantly longer.
+ q => 'XQUOT',
+ nq => 'XNQ',
);
# not documenting m: and mid: for now, the using the URLs works w/o Xapian
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 37fefbe..cd27a29 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -173,7 +173,10 @@ sub add_message {
my $tg = $self->term_generator;
$tg->set_document($doc);
- $tg->index_text($subj, 1, 'S') if $subj;
+ if ($subj) {
+ $tg->index_text($subj, 1, 'S');
+ $tg->index_text($subj, 1, 'XBS');
+ }
$tg->increase_termpos;
$tg->index_text($subj) if $subj;
$tg->increase_termpos;
@@ -199,13 +202,21 @@ sub add_message {
}
}
if (@quot) {
- $tg->index_text(join("\n", @quot), 0);
+ my $s = join("\n", @quot);
@quot = ();
+ $tg->index_text($s, 1, 'XQUOT');
+ $tg->index_text($s, 0, 'XBS');
+ $tg->index_text($s, 0, 'XBODY');
+ $tg->index_text($s, 0);
$tg->increase_termpos;
}
if (@orig) {
- $tg->index_text(join("\n", @orig));
+ my $s = join("\n", @orig);
@orig = ();
+ $tg->index_text($s, 1, 'XNQ');
+ $tg->index_text($s, 1, 'XBS');
+ $tg->index_text($s, 1, 'XBODY');
+ $tg->index_text($s);
$tg->increase_termpos;
}
});
diff --git a/t/search.t b/t/search.t
index 7abaf83..bddb545 100644
--- a/t/search.t
+++ b/t/search.t
@@ -361,6 +361,31 @@ sub filter_mids {
}
}
+{
+ $rw_commit->();
+ $ro->reopen;
+ my $res = $ro->query('b:hello');
+ is(scalar @{$res->{msgs}}, 0, 'no match on body search only');
+ $res = $ro->query('bs:smith');
+ is(scalar @{$res->{msgs}}, 0,
+ 'no match on body+subject search for From');
+
+ $res = $ro->query('q:theatre');
+ is(scalar @{$res->{msgs}}, 1, 'only one quoted body');
+ like($res->{msgs}->[0]->from, qr/\AQuoter/, 'got quoted body');
+
+ $res = $ro->query('nq:theatre');
+ is(scalar @{$res->{msgs}}, 1, 'only one non-quoted body');
+ like($res->{msgs}->[0]->from, qr/\ANon-Quoter/, 'got non-quoted body');
+
+ foreach my $pfx (qw(b: bs:)) {
+ $res = $ro->query($pfx . 'theatre');
+ is(scalar @{$res->{msgs}}, 2, "searched both bodies for $pfx");
+ like($res->{msgs}->[0]->from, qr/\ANon-Quoter/,
+ "non-quoter first for $pfx");
+ }
+}
+
done_testing();
1;
--
EW
^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2016-09-07 23:57 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-07 23:57 [RFC 0/3] search: more mairix compatibility changes Eric Wong
2016-09-07 23:57 ` [PATCH 1/3] search: allow searching user fields (To/Cc/From) Eric Wong
2016-09-07 23:57 ` [PATCH 2/3] search: drop longer subject: prefix for search Eric Wong
2016-09-07 23:57 ` [PATCH 3/3] search: more granular message body searching Eric Wong
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).