From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: AS200651 185.100.86.0/24 X-Spam-Status: No, score=-2.6 required=3.0 tests=BAYES_00,RCVD_IN_MSPIKE_BL, RCVD_IN_MSPIKE_ZBI,RCVD_IN_XBL,SPF_FAIL,SPF_HELO_FAIL,TO_EQ_FM_DOM_SPF_FAIL shortcircuit=no autolearn=no autolearn_force=no version=3.4.0 Received: from 80x24.org (torsrv0.snydernet.net [185.100.86.154]) by dcvr.yhbt.net (Postfix) with ESMTP id 0E11320357 for ; Wed, 12 Jul 2017 22:32:28 +0000 (UTC) From: Eric Wong To: spew@80x24.org Subject: [PATCH] hash: keep fstrings of tainted strings for string keys Date: Wed, 12 Jul 2017 22:32:26 +0000 Message-Id: <20170712223226.20939-1-e@80x24.org> List-Id: The same hash keys may be loaded from tainted data sources frequently. If a non-tainted fstring already exists (because the application expects the hash key), cache and deduplicate the tainted version in the new tainted_frozen_strings table. For non-embedded strings, this also allows sharing with the underlying malloc-ed data. * vm_core.h (rb_vm_struct): add tainted_frozen_strings * vm.c (ruby_vm_destruct): free tainted_frozen_strings (Init_vm_objects): initialize tainted_frozen_strings (rb_vm_tfstring_table): accessor for tainted_frozen_strings * internal.h: declare rb_fstring_existing, rb_vm_tfstring_table * hash.c (fstring_existing_str): remove (moved to string.c) (hash_aset_str): use rb_fstring_existing * string.c (rb_fstring_existing): new, based on fstring_existing_str (tainted_fstr_update): new (rb_fstring_existing0): new, based on fstring_existing_str (rb_tainted_fstring_existing): new, special case for tainted strings (rb_str_free): delete from tainted_frozen_strings table * test/ruby/test_optimization.rb (test_hash_reuse_fstring): new test --- hash.c | 23 +---------- internal.h | 3 +- string.c | 94 ++++++++++++++++++++++++++++++++++++++++++ test/ruby/test_optimization.rb | 31 ++++++++++++++ vm.c | 12 ++++++ vm_core.h | 1 + 6 files changed, 141 insertions(+), 23 deletions(-) diff --git a/hash.c b/hash.c index 7a0beb7225..e17d906e9b 100644 --- a/hash.c +++ b/hash.c @@ -18,7 +18,6 @@ #include "probes.h" #include "id.h" #include "symbol.h" -#include "gc.h" #ifdef __APPLE__ # ifdef HAVE_CRT_EXTERNS_H @@ -1516,33 +1515,13 @@ hash_aset(st_data_t *key, st_data_t *val, struct update_arg *arg, int existing) return ST_CONTINUE; } -static VALUE -fstring_existing_str(VALUE str) -{ - st_data_t fstr; - st_table *tbl = rb_vm_fstring_table(); - - if (st_lookup(tbl, str, &fstr)) { - if (rb_objspace_garbage_object_p(fstr)) { - return rb_fstring(str); - } - else { - return (VALUE)fstr; - } - } - else { - return Qnil; - } -} - static int hash_aset_str(st_data_t *key, st_data_t *val, struct update_arg *arg, int existing) { if (!existing && !RB_OBJ_FROZEN(*key)) { VALUE k; - if (!RB_OBJ_TAINTED(*key) && - (k = fstring_existing_str(*key)) != Qnil) { + if ((k = rb_fstring_existing(*key)) != Qnil) { *key = k; } else { diff --git a/internal.h b/internal.h index 81f4d2b91d..e6d50ebcda 100644 --- a/internal.h +++ b/internal.h @@ -1574,6 +1574,7 @@ VALUE rb_strftime(const char *format, size_t format_len, rb_encoding *enc, /* string.c */ VALUE rb_fstring(VALUE); VALUE rb_fstring_new(const char *ptr, long len); +VALUE rb_fstring_existing(VALUE); #define rb_fstring_lit(str) rb_fstring_new((str), rb_strlen_lit(str)) #define rb_fstring_literal(str) rb_fstring_lit(str) VALUE rb_fstring_cstr(const char *str); @@ -1726,7 +1727,7 @@ void rb_vm_check_redefinition_by_prepend(VALUE klass); VALUE rb_yield_refine_block(VALUE refinement, VALUE refinements); VALUE ruby_vm_special_exception_copy(VALUE); PUREFUNC(st_table *rb_vm_fstring_table(void)); - +PUREFUNC(st_table *rb_vm_tfstring_table(void)); /* vm_dump.c */ void rb_print_backtrace(void); diff --git a/string.c b/string.c index 2012a281d6..72fb65087c 100644 --- a/string.c +++ b/string.c @@ -349,6 +349,99 @@ register_fstring(VALUE str) return ret; } +static int +tainted_fstr_update(st_data_t *key, st_data_t *val, st_data_t arg, int existing) +{ + VALUE *fstr = (VALUE *)arg; + VALUE str = (VALUE)*key; + + if (existing) { + /* because of lazy sweep, str may be unmarked already and swept + * at next time */ + if (rb_objspace_garbage_object_p(str)) { + *fstr = Qundef; + return ST_DELETE; + } + + *fstr = str; + return ST_STOP; + } + else { + str = rb_str_resurrect(str); + RB_OBJ_TAINT_RAW(str); + RB_FL_SET_RAW(str, RSTRING_FSTR); + RB_OBJ_FREEZE_RAW(str); + + *key = *val = *fstr = str; + return ST_CONTINUE; + } +} + +static VALUE +rb_fstring_existing0(VALUE str) +{ + st_table *frozen_strings = rb_vm_fstring_table(); + st_data_t fstr; + + if (st_lookup(frozen_strings, str, &fstr)) { + if (rb_objspace_garbage_object_p(fstr)) { + return register_fstring(str); + } + else { + return (VALUE)fstr; + } + } + return Qnil; +} + +static VALUE +rb_tainted_fstring_existing(VALUE str) +{ + VALUE ret; + st_data_t fstr; + st_table *tfstrings = rb_vm_tfstring_table(); + + if (st_lookup(tfstrings, str, &fstr)) { + ret = (VALUE)fstr; + if (!rb_objspace_garbage_object_p(ret)) { + return ret; + } + } + ret = rb_fstring_existing0(str); + if (NIL_P(ret)) { + return Qnil; + } + if (!RB_FL_TEST_RAW(ret, RSTRING_FSTR)) { + return Qnil; + } + do { + fstr = (st_data_t)ret; + st_update(tfstrings, fstr, tainted_fstr_update, (st_data_t)&fstr); + } while ((VALUE)fstr == Qundef); + + ret = (VALUE)fstr; + assert(OBJ_FROZEN_RAW(ret)); + assert(!FL_TEST_RAW(ret, STR_FAKESTR)); + assert(!FL_TEST_RAW(ret, FL_EXIVAR)); + assert(FL_TEST_RAW(ret, RSTRING_FSTR)); + assert(FL_TEST_RAW(ret, FL_TAINT)); + assert(RBASIC_CLASS(ret) == rb_cString); + + return ret; +} + +VALUE +rb_fstring_existing(VALUE str) +{ + if (FL_TEST_RAW(str, RSTRING_FSTR)) + return str; + + if (!RB_OBJ_TAINTED_RAW(str)) + return rb_fstring_existing0(str); + + return rb_tainted_fstring_existing(str); +} + static VALUE setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx) { @@ -1311,6 +1404,7 @@ rb_str_free(VALUE str) if (FL_TEST(str, RSTRING_FSTR)) { st_data_t fstr = (st_data_t)str; st_delete(rb_vm_fstring_table(), &fstr, NULL); + st_delete(rb_vm_tfstring_table(), &fstr, NULL); RB_DEBUG_COUNTER_INC(obj_str_fstr); } diff --git a/test/ruby/test_optimization.rb b/test/ruby/test_optimization.rb index 2d0eec02fd..13c1a8cee2 100644 --- a/test/ruby/test_optimization.rb +++ b/test/ruby/test_optimization.rb @@ -181,6 +181,37 @@ def test_hash_empty? assert_redefine_method('Hash', 'empty?', 'assert_nil({}.empty?); assert_nil({1=>1}.empty?)') end + def test_hash_reuse_fstring + embed = -'h e l l o' + shared = -'this string is too long to be embedded and should be shared' + [ embed, shared ].each do |exp| + key = exp.split(' ').join(' ') + h = {} + h[key] = true + assert_not_same key, h.keys[0] + assert_predicate h.keys[0], :frozen? + assert_same exp, h.keys[0] + + h1 = {} + h2 = {} + key.taint + h1[key] = true + h2[key] = true + k1 = h1.keys[0] + k2 = h2.keys[0] + assert_same k1, k2 + assert_predicate k1, :tainted? + + assert_equal GC::INTERNAL_CONSTANTS[:RVALUE_SIZE], + ObjectSpace.memsize_of(k1) + + if exp == shared + assert_not_equal GC::INTERNAL_CONSTANTS[:RVALUE_SIZE], + ObjectSpace.memsize_of(exp) + end + end + end + def test_hash_aref_with h = { "foo" => 1 } assert_equal 1, h["foo"] diff --git a/vm.c b/vm.c index 814f8b6780..75ca12f2a5 100644 --- a/vm.c +++ b/vm.c @@ -2206,6 +2206,10 @@ ruby_vm_destruct(rb_vm_t *vm) st_free_table(vm->frozen_strings); vm->frozen_strings = 0; } + if (vm->tainted_frozen_strings) { + st_free_table(vm->tainted_frozen_strings); + vm->tainted_frozen_strings = 0; + } rb_vm_gvl_destroy(vm); if (objspace) { rb_objspace_free(objspace); @@ -3142,6 +3146,8 @@ Init_vm_objects(void) vm->mark_object_ary = rb_ary_tmp_new(128); vm->loading_table = st_init_strtable(); vm->frozen_strings = st_init_table_with_size(&rb_fstring_hash_type, 1000); + vm->tainted_frozen_strings = + st_init_table_with_size(&rb_fstring_hash_type, 1000); } /* top self */ @@ -3203,6 +3209,12 @@ rb_vm_fstring_table(void) return GET_VM()->frozen_strings; } +st_table * +rb_vm_tfstring_table(void) +{ + return GET_VM()->tainted_frozen_strings; +} + #if VM_COLLECT_USAGE_DETAILS #define HASH_ASET(h, k, v) rb_hash_aset((h), (st_data_t)(k), (st_data_t)(v)) diff --git a/vm_core.h b/vm_core.h index c23f50690e..fce3cd4128 100644 --- a/vm_core.h +++ b/vm_core.h @@ -574,6 +574,7 @@ typedef struct rb_vm_struct { VALUE *defined_strings; st_table *frozen_strings; + st_table *tainted_frozen_strings; /* params */ struct { /* size in byte */ -- EW