about summary refs log tree commit homepage
path: root/lib/PublicInbox/IMAP.pm
diff options
context:
space:
mode:
authorEric Wong <e@yhbt.net>2020-06-12 23:49:24 +0000
committerEric Wong <e@yhbt.net>2020-06-13 07:55:45 +0000
commit3d52c093ad5ce7a32f8842d9ae020712f9786352 (patch)
treef375b25be12526cbbae99e8748cb0738e83a0eed /lib/PublicInbox/IMAP.pm
parentc5fb8d66dfc1fa7034c6e7350bc6474cdde6b6a7 (diff)
downloadpublic-inbox-3d52c093ad5ce7a32f8842d9ae020712f9786352.tar.gz
Since we limit our mailboxes slices to 50K and can guarantee a
contiguous UID space for those mailboxes, we can store a mapping
of "UID offsets" (not full UIDs) to Message Sequence Numbers as
an array of 16-bit unsigned integers in a 100K scalar.

For UID-only FETCH responses, we can momentarily unpack the
compact 100K representation to a ~1.6M Perl array of IV/UV
elements for a slight speedup.

Furthermore, we can (ab)use hash key deduplication in Perl5 to
deduplicate this 100K scalar across all clients with the same
mailbox slice open.

Technically we can increase our slice size to 64K w/o increasing
our storage overhead, but I suspect humans are more accustomed
to slices easily divisible by 10.
Diffstat (limited to 'lib/PublicInbox/IMAP.pm')
-rw-r--r--lib/PublicInbox/IMAP.pm178
1 files changed, 133 insertions, 45 deletions
diff --git a/lib/PublicInbox/IMAP.pm b/lib/PublicInbox/IMAP.pm
index ff01d0b5..bc890517 100644
--- a/lib/PublicInbox/IMAP.pm
+++ b/lib/PublicInbox/IMAP.pm
@@ -16,18 +16,17 @@
 #   are for the limitations of git clients, while slices are
 #   for the limitations of IMAP clients.
 #
-# * sequence numbers are estimated based on slice.  If they
-#   wrong, they're higher than than the corresponding UID
-#   because UIDs have gaps due to spam removals.
-#   We only support an ephemeral mapping non-UID "FETCH"
-#   because mutt header caching relies on it; mutt uses
-#   UID requests everywhere else.
+# * We also take advantage of slices being only 50K to store
+#   "UID offset" to message sequence number (MSN) mapping
+#   as a 50K uint16_t array (via pack("S*", ...)).  "UID offset"
+#   is the offset from {uid_base} which determines the start of
+#   the mailbox slice.
 
 package PublicInbox::IMAP;
 use strict;
 use base qw(PublicInbox::DS);
 use fields qw(imapd ibx long_cb -login_tag
-        uid_base -idle_tag -idle_max);
+        uid_base -idle_tag uo2m);
 use PublicInbox::Eml;
 use PublicInbox::EmlContentFoo qw(parse_content_disposition);
 use PublicInbox::DS qw(now);
@@ -50,7 +49,9 @@ die "neither Email::Address::XS nor Mail::Address loaded: $@" if !$Address;
 
 sub LINE_MAX () { 8000 } # RFC 2683 3.2.1.5
 
-# changing this will cause grief for clients which cache
+# Changing UID_SLICE will cause grief for clients which cache.
+# This also needs to be <64K: we pack it into a uint16_t
+# for long_response UID (offset) => MSN mappings
 sub UID_SLICE () { 50_000 }
 
 # these values area also used for sorting
@@ -196,23 +197,118 @@ sub cmd_capability ($$) {
 
 sub cmd_noop ($$) { "$_[1] OK Noop done\r\n" }
 
+# uo2m: UID Offset to MSN, this is an arrayref by default,
+# but uo2m_hibernate can compact and deduplicate it
+sub uo2m_ary_new ($) {
+        my ($self) = @_;
+        my $base = $self->{uid_base};
+        my $uids = $self->{ibx}->over->uid_range($base + 1, $base + UID_SLICE);
+
+        # convert UIDs to offsets from {base}
+        my @tmp; # [$UID_OFFSET] => $MSN
+        my $msn = 0;
+        ++$base;
+        $tmp[$_ - $base] = ++$msn for @$uids;
+        \@tmp;
+}
+
+# changes UID-offset-to-MSN mapping into a deduplicated scalar:
+# uint16_t uo2m[UID_SLICE].
+# May be swapped out for idle clients if THP is disabled.
+sub uo2m_hibernate ($) {
+        my ($self) = @_;
+        ref(my $uo2m = $self->{uo2m}) or return;
+        my %dedupe = ( uo2m_pack($uo2m) => undef );
+        $self->{uo2m} = (keys(%dedupe))[0];
+        undef;
+}
+
+sub uo2m_last_uid ($) {
+        my ($self) = @_;
+        my $uo2m = $self->{uo2m} or die 'BUG: uo2m_last_uid w/o {uo2m}';
+        (ref($uo2m) ? @$uo2m : (length($uo2m) >> 1)) + $self->{uid_base};
+}
+
+sub uo2m_pack ($) {
+        # $_[0] is an arrayref of MSNs, it may have undef gaps if there
+        # are gaps in the corresponding UIDs: [ msn1, msn2, undef, msn3 ]
+        no warnings 'uninitialized';
+        pack('S*', @{$_[0]});
+}
+
+# extend {uo2m} to account for new messages which arrived since
+# {uo2m} was created.
+sub uo2m_extend ($$) {
+        my ($self, $new_uid_max) = @_;
+        defined(my $uo2m = $self->{uo2m}) or
+                return($self->{uo2m} = uo2m_ary_new($self));
+        my $beg = uo2m_last_uid($self); # last UID we've learned
+        return $uo2m if $beg >= $new_uid_max; # fast path
+
+        # need to extend the current range:
+        my $base = $self->{uid_base};
+        ++$beg;
+        my $uids = $self->{ibx}->over->uid_range($beg, $base + UID_SLICE);
+        my @tmp; # [$UID_OFFSET] => $MSN
+        if (ref($uo2m)) {
+                my $msn = $uo2m->[-1];
+                $tmp[$_ - $beg] = ++$msn for @$uids;
+                push @$uo2m, @tmp;
+                $uo2m;
+        } else {
+                my $msn = unpack('S', substr($uo2m, -2, 2));
+                $tmp[$_ - $beg] = ++$msn for @$uids;
+                $uo2m .= uo2m_pack(\@tmp);
+                my %dedupe = ($uo2m => undef);
+                $self->{uo2m} = (keys %dedupe)[0];
+        }
+}
+
+# the flexible version which works on scalars and array refs.
+# Must call uo2m_extend before this
+sub uid2msn ($$) {
+        my ($self, $uid) = @_;
+        my $uo2m = $self->{uo2m};
+        my $off = $uid - $self->{uid_base} - 1;
+        ref($uo2m) ? $uo2m->[$off] : unpack('S', substr($uo2m, $off << 1, 2));
+}
+
+# returns an arrayref of UIDs, so MSNs can be translated to UIDs via:
+# $msn2uid->[$MSN-1] => $UID.  The result of this is always ephemeral
+# and does not live beyond the event loop.
+sub msn2uid ($) {
+        my ($self) = @_;
+        my $base = $self->{uid_base};
+        my $uo2m = uo2m_extend($self, $base + UID_SLICE);
+        $uo2m = [ unpack('S*', $uo2m) ] if !ref($uo2m);
+
+        my $uo = 0;
+        my @msn2uid;
+        for my $msn (@$uo2m) {
+                ++$uo;
+                $msn2uid[$msn - 1] = $uo + $base if $msn;
+        }
+        \@msn2uid;
+}
+
+# converts a set of message sequence numbers in requests to UIDs:
+sub msn_to_uid_range ($$) {
+        my $msn2uid = $_[0];
+        $_[1] =~ s!([0-9]+)!$msn2uid->[$1 - 1] // ($msn2uid->[-1] + 1)!sge;
+}
+
 # called by PublicInbox::InboxIdle
 sub on_inbox_unlock {
         my ($self, $ibx) = @_;
-        my $new = $ibx->over->max;
-        my $uid_base = $self->{uid_base} // 0;
-        my $uid_end = $uid_base + UID_SLICE;
-        defined(my $old = $self->{-idle_max}) or die 'BUG: -idle_max unset';
-        $new = $uid_end if $new > $uid_end;
+        my $old = uo2m_last_uid($self);
+        my $uid_end = $self->{uid_base} + UID_SLICE;
+        uo2m_extend($self, $uid_end);
+        my $new = uo2m_last_uid($self);
         if ($new > $old) {
-                $self->{-idle_max} = $new;
-                $new -= $uid_base;
-                $old -= $uid_base;
-                $self->msg_more("* $_ EXISTS\r\n") for (($old + 1)..($new - 1));
-                $self->write(\"* $new EXISTS\r\n");
+                my $msn = uid2msn($self, $new);
+                $self->write(\"* $msn EXISTS\r\n");
         } elsif ($new == $uid_end) { # max exceeded $uid_end
                 # continue idling w/o inotify
-                delete $self->{-idle_max};
                 my $sock = $self->{sock} or return;
                 $ibx->unsubscribe_unlock(fileno($sock));
         }
@@ -245,9 +341,9 @@ sub cmd_idle ($$) {
         my $fd = fileno($sock);
         # only do inotify on most recent slice
         if ($max < $uid_end) {
+                uo2m_extend($self, $uid_end);
                 $ibx->subscribe_unlock($fd, $self);
                 $self->{imapd}->idler_start;
-                $self->{-idle_max} = $max;
         }
         $idle_timer //= PublicInbox::DS::later(\&idle_tick_all);
         $IDLERS->{$fd} = $self;
@@ -497,12 +593,10 @@ sub requeue_once ($) {
         $self->requeue if $new_size == 1;
 }
 
-# my ($msn, $UID) = @_;
-sub fetch_msn_uid ($$) { '* '.(${$_[0]}++).' FETCH (UID '.$_[1] }
-
 sub fetch_run_ops {
-        my ($self, $msn, $smsg, $bref, $ops, $partial) = @_;
-        $self->msg_more(fetch_msn_uid($msn, $smsg->{num}));
+        my ($self, $smsg, $bref, $ops, $partial) = @_;
+        my $uid = $smsg->{num};
+        $self->msg_more('* '.uid2msn($self, $uid)." FETCH (UID $uid");
         my ($eml, $k);
         for (my $i = 0; $i < @$ops;) {
                 $k = $ops->[$i++];
@@ -523,7 +617,7 @@ sub fetch_blob_cb { # called by git->cat_async via git_async_cat
         } else {
                 $smsg->{blob} eq $oid or die "BUG: $smsg->{blob} != $oid";
         }
-        fetch_run_ops($self, $range_info->[3], $smsg, $bref, $ops, $partial);
+        fetch_run_ops($self, $smsg, $bref, $ops, $partial);
         requeue_once($self);
 }
 
@@ -627,8 +721,7 @@ sub range_step ($$) {
         } else {
                 return 'BAD fetch range';
         }
-        my $msn = $beg - $uid_base;
-        [ $beg, $end, $$range_csv, \$msn ];
+        [ $beg, $end, $$range_csv ];
 }
 
 sub refill_range ($$$) {
@@ -653,6 +746,7 @@ sub fetch_blob { # long_response
                         return;
                 }
         }
+        uo2m_extend($self, $msgs->[-1]->{num});
         git_async_cat($self->{ibx}->git, $msgs->[0]->{blob},
                         \&fetch_blob_cb, \@_);
 }
@@ -665,7 +759,8 @@ sub fetch_smsg { # long_response
                         return;
                 }
         }
-        fetch_run_ops($self, $range_info->[3], $_, undef, $ops) for @$msgs;
+        uo2m_extend($self, $msgs->[-1]->{num});
+        fetch_run_ops($self, $_, undef, $ops) for @$msgs;
         @$msgs = ();
         1; # more
 }
@@ -696,10 +791,12 @@ sub fetch_uid { # long_response
                 $self->write("$tag $err\r\n");
                 return;
         }
+        my $adj = $self->{uid_base} + 1;
+        my $uo2m = uo2m_extend($self, $uids->[-1]);
+        $uo2m = [ unpack('S*', $uo2m) ] if !ref($uo2m);
         my ($i, $k);
-        my $msn = $range_info->[3];
         for (@$uids) {
-                $self->msg_more(fetch_msn_uid($msn, $_));
+                $self->msg_more("* $uo2m->[$_ - $adj] FETCH (UID $_");
                 for ($i = 0; $i < @$ops;) {
                         $k = $ops->[$i++];
                         $ops->[$i++]->($self, $k);
@@ -956,25 +1053,14 @@ sub cmd_uid_fetch ($$$$;@) {
         my ($cb, $ops, $partial) = fetch_compile(\@want);
         return "$tag $cb\r\n" unless $ops;
 
+        # cb is one of fetch_blob, fetch_smsg, fetch_uid
         $range_csv = 'bad' if $range_csv !~ $valid_range;
         my $range_info = range_step($self, \$range_csv);
         return "$tag $range_info\r\n" if !ref($range_info);
+        uo2m_hibernate($self) if $cb == \&fetch_blob; # slow, save RAM
         long_response($self, $cb, $tag, [], $range_info, $ops, $partial);
 }
 
-# returns an arrayref of UIDs, so MSNs can be translated via:
-# $msn2uid->[$MSN-1] => $UID
-sub msn2uid ($) {
-        my ($self) = @_;
-        my $x = $self->{uid_base};
-        $self->{ibx}->over->uid_range($x + 1, $x + UID_SLICE);
-}
-
-sub msn_to_uid_range ($$) {
-        my $msn2uid = $_[0];
-        $_[1] =~ s!([0-9]+)!$msn2uid->[$1 - 1] // ($msn2uid->[-1] + 1)!sge;
-}
-
 sub cmd_fetch ($$$$;@) {
         my ($self, $tag, $range_csv, @want) = @_;
         my $ibx = $self->{ibx} or return "$tag BAD No mailbox selected\r\n";
@@ -986,6 +1072,7 @@ sub cmd_fetch ($$$$;@) {
         msn_to_uid_range(msn2uid($self), $range_csv);
         my $range_info = range_step($self, \$range_csv);
         return "$tag $range_info\r\n" if !ref($range_info);
+        uo2m_hibernate($self) if $cb == \&fetch_blob; # slow, save RAM
         long_response($self, $cb, $tag, [], $range_info, $ops, $partial);
 }
 
@@ -1318,7 +1405,8 @@ sub event_step {
                         $self->write(\"\* BAD request too long\r\n");
                         return $self->close;
                 }
-                $self->do_read($rbuf, LINE_MAX, length($$rbuf)) or return;
+                $self->do_read($rbuf, LINE_MAX, length($$rbuf)) or
+                                return uo2m_hibernate($self);
                 $line = index($$rbuf, "\n");
         }
         $line = substr($$rbuf, 0, $line + 1, '');