// Copyright (C) all contributors // License: GPL-2.0+ // This file is only intended to be included by xap_helper.h // it implements pieces used by CodeSearchIdx.pm static void term_length_extract(struct req *req) { req->lenv = (size_t *)calloc(req->pfxc, sizeof(size_t)); if (!req->lenv) EABORT("lenv = calloc(%d %zu)", req->pfxc, sizeof(size_t)); for (int i = 0; i < req->pfxc; i++) { char *pfx = req->pfxv[i]; // extract trailing digits as length: // $len = s/([0-9]+)\z// ? ($1+0) : 0 for (size_t j = 0; pfx[j]; j++) { if (pfx[j] < '0' || pfx[j] > '9') continue; if (j == 0) { warnx("W: `%s' not a valid prefix", pfx); continue; } char *end; unsigned long long tmp = strtoull(pfx + j, &end, 10); if (*end || tmp >= (unsigned long long)SIZE_MAX) { warnx("W: `%s' not recognized", pfx); } else { req->lenv[i] = (size_t)tmp; pfx[j] = 0; break; } } } } static void dump_ibx_term(struct req *req, int p, Xapian::Document *doc, const char *ibx_id) { Xapian::TermIterator cur = doc->termlist_begin(); Xapian::TermIterator end = doc->termlist_end(); const char *pfx = req->pfxv[p]; size_t pfx_len = strlen(pfx); size_t term_len = req->lenv[p]; for (cur.skip_to(pfx); cur != end; cur++) { std::string tn = *cur; if (!starts_with(&tn, pfx, pfx_len)) break; if (term_len > 0 && (tn.length() - pfx_len) != term_len) continue; fprintf(req->fp[0], "%s %s\n", tn.c_str() + pfx_len, ibx_id); ++req->nr_out; } } static enum exc_iter dump_ibx_iter(struct req *req, const char *ibx_id, Xapian::MSetIterator *i) { try { Xapian::Document doc = i->get_document(); for (int p = 0; p < req->pfxc; p++) dump_ibx_term(req, p, &doc, ibx_id); } catch (const Xapian::DatabaseModifiedError & e) { req->srch->db->reopen(); return ITER_RETRY; } catch (const Xapian::DocNotFoundError & e) { // oh well... warnx("doc not found: %s", e.get_description().c_str()); } return ITER_OK; } static bool cmd_dump_ibx(struct req *req) { if ((optind + 1) >= req->argc) ABORT("usage: dump_ibx [OPTIONS] IBX_ID QRY_STR"); if (!req->pfxc) ABORT("dump_ibx requires -A PREFIX"); const char *ibx_id = req->argv[optind]; if (my_setlinebuf(req->fp[0])) // for sort(1) pipe EABORT("setlinebuf(fp[0])"); // WTF? req->asc = true; req->sort_col = -1; term_length_extract(req); Xapian::MSet mset = mail_mset(req, req->argv[optind + 1]); // @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine // in case we need to retry on DB reopens for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { for (int t = 10; t > 0; --t) switch (dump_ibx_iter(req, ibx_id, &i)) { case ITER_OK: t = 0; break; // leave inner loop case ITER_RETRY: break; // continue for-loop case ITER_ABORT: return false; // error } } emit_mset_stats(req, &mset); return true; } struct dump_roots_tmp { struct stat sb; void *mm_ptr; char **entries; struct fbuf wbuf; int root2off_fd; }; #define CLEANUP_DUMP_ROOTS __attribute__((__cleanup__(dump_roots_ensure))) static void dump_roots_ensure(void *ptr) { struct dump_roots_tmp *drt = (struct dump_roots_tmp *)ptr; if (drt->root2off_fd >= 0) xclose(drt->root2off_fd); hdestroy(); // idempotent size_t size = off2size(drt->sb.st_size); if (drt->mm_ptr && munmap(drt->mm_ptr, size)) EABORT("BUG: munmap(%p, %zu)", drt->mm_ptr, size); free(drt->entries); fbuf_ensure(&drt->wbuf); } static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc) { Xapian::TermIterator cur = doc->termlist_begin(); Xapian::TermIterator end = doc->termlist_end(); ENTRY e, *ep; fbuf_init(root_offs); for (cur.skip_to("G"); cur != end; cur++) { std::string tn = *cur; if (!starts_with(&tn, "G", 1)) break; union { const char *in; char *out; } u; u.in = tn.c_str() + 1; e.key = u.out; ep = hsearch(e, FIND); if (!ep) ABORT("hsearch miss `%s'", e.key); // ep->data is a NUL-terminated string matching /[0-9]+/ fputc(' ', root_offs->fp); fputs((const char *)ep->data, root_offs->fp); } fputc('\n', root_offs->fp); ERR_CLOSE(root_offs->fp, EXIT_FAILURE); // ENOMEM root_offs->fp = NULL; return true; } // writes term values matching @pfx for a given @doc, ending the line // with the contents of @root_offs static void dump_roots_term(struct req *req, int p, struct dump_roots_tmp *drt, struct fbuf *root_offs, Xapian::Document *doc) { Xapian::TermIterator cur = doc->termlist_begin(); Xapian::TermIterator end = doc->termlist_end(); const char *pfx = req->pfxv[p]; size_t pfx_len = strlen(pfx); size_t term_len = req->lenv[p]; for (cur.skip_to(pfx); cur != end; cur++) { std::string tn = *cur; if (!starts_with(&tn, pfx, pfx_len)) break; if (term_len > 0 && (tn.length() - pfx_len) != term_len) continue; fputs(tn.c_str() + pfx_len, drt->wbuf.fp); fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp); ++req->nr_out; } } // we may have lines which exceed PIPE_BUF, so we do our own // buffering and rely on flock(2), here static bool dump_roots_flush(struct req *req, struct dump_roots_tmp *drt) { bool ok = true; off_t off = ftello(drt->wbuf.fp); if (off < 0) EABORT("ftello"); if (!off) return ok; ERR_FLUSH(drt->wbuf.fp); // ENOMEM int fd = fileno(req->fp[0]); while (flock(drt->root2off_fd, LOCK_EX)) { if (errno == EINTR) continue; err(EXIT_FAILURE, "LOCK_EX"); // ENOLCK? } ok = write_all(fd, &drt->wbuf, (size_t)off); while (flock(drt->root2off_fd, LOCK_UN)) { if (errno == EINTR) continue; err(EXIT_FAILURE, "LOCK_UN"); // ENOLCK? } if (fseeko(drt->wbuf.fp, 0, SEEK_SET)) EABORT("fseeko"); return ok; } static enum exc_iter dump_roots_iter(struct req *req, struct dump_roots_tmp *drt, Xapian::MSetIterator *i) { CLEANUP_FBUF struct fbuf root_offs = {}; // " $ID0 $ID1 $IDx..\n" try { Xapian::Document doc = i->get_document(); if (!root2offs_str(&root_offs, &doc)) return ITER_ABORT; // bad request, abort for (int p = 0; p < req->pfxc; p++) dump_roots_term(req, p, drt, &root_offs, &doc); } catch (const Xapian::DatabaseModifiedError & e) { req->srch->db->reopen(); return ITER_RETRY; } catch (const Xapian::DocNotFoundError & e) { // oh well... warnx("doc not found: %s", e.get_description().c_str()); } return ITER_OK; } static bool cmd_dump_roots(struct req *req) { CLEANUP_DUMP_ROOTS struct dump_roots_tmp drt = {}; drt.root2off_fd = -1; if ((optind + 1) >= req->argc) ABORT("usage: dump_roots [OPTIONS] ROOT2ID_FILE QRY_STR"); if (!req->pfxc) ABORT("dump_roots requires -A PREFIX"); const char *root2off_file = req->argv[optind]; drt.root2off_fd = open(root2off_file, O_RDONLY); if (drt.root2off_fd < 0) EABORT("open(%s)", root2off_file); if (fstat(drt.root2off_fd, &drt.sb)) // ENOMEM? err(EXIT_FAILURE, "fstat(%s)", root2off_file); // each entry is at least 43 bytes ({OIDHEX}\0{INT}\0), // so /32 overestimates the number of expected entries by // ~%25 (as recommended by Linux hcreate(3) manpage) size_t size = off2size(drt.sb.st_size); size_t est = (size / 32) + 1; //+1 for "\0" termination drt.mm_ptr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, drt.root2off_fd, 0); if (drt.mm_ptr == MAP_FAILED) err(EXIT_FAILURE, "mmap(%zu, %s)", size, root2off_file); size_t asize = est * 2; if (asize < est) ABORT("too many entries: %zu", est); drt.entries = (char **)calloc(asize, sizeof(char *)); if (!drt.entries) err(EXIT_FAILURE, "calloc(%zu * 2, %zu)", est, sizeof(char *)); size_t tot = split2argv(drt.entries, (char *)drt.mm_ptr, size, asize); if (tot <= 0) return false; // split2argv already warned on error if (!hcreate(est)) err(EXIT_FAILURE, "hcreate(%zu)", est); for (size_t i = 0; i < tot; ) { ENTRY e; e.key = hsearch_enter_key(drt.entries[i++]); // dies on ENOMEM e.data = drt.entries[i++]; if (!hsearch(e, ENTER)) err(EXIT_FAILURE, "hsearch(%s => %s, ENTER)", e.key, (const char *)e.data); } req->asc = true; req->sort_col = -1; Xapian::MSet mset = commit_mset(req, req->argv[optind + 1]); term_length_extract(req); fbuf_init(&drt.wbuf); // @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine // in case we need to retry on DB reopens for (Xapian::MSetIterator i = mset.begin(); i != mset.end(); i++) { for (int t = 10; t > 0; --t) switch (dump_roots_iter(req, &drt, &i)) { case ITER_OK: t = 0; break; // leave inner loop case ITER_RETRY: break; // continue for-loop case ITER_ABORT: return false; // error } if (!(req->nr_out & 0x3fff) && !dump_roots_flush(req, &drt)) return false; } if (!dump_roots_flush(req, &drt)) return false; emit_mset_stats(req, &mset); return true; }