From 45319ccf605ae037ac8bfdcf6591111d9020d08a Mon Sep 17 00:00:00 2001 From: Stefan Majewsky Date: Sun, 18 Apr 2021 14:33:13 +0200 Subject: [PATCH] shave 4 bytes off the Gloss and LoanwordSource representations This saves 5.29 MiB of payload when compiling with `--features full`. The total payload in that configuration is now 66.72 MiB (breaking down into 30.75 MiB raw text plus 35.97 MiB metadata). --- build.rs | 19 +++++++++++-------- src/lib.rs | 4 ++-- src/payload.rs | 20 ++++++++++---------- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/build.rs b/build.rs index 476d53e..a931c36 100644 --- a/build.rs +++ b/build.rs @@ -277,7 +277,7 @@ impl ToPayload for jmdict_traverse::RawSense<'_> { impl ToPayload for jmdict_traverse::RawLSource<'_> { fn size() -> usize { - 5 + 4 } fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) { @@ -287,26 +287,29 @@ impl ToPayload for jmdict_traverse::RawLSource<'_> { let r = omni.push_str(self.lang); buf[2] = r.start; buf[3] = r.end; - buf[4] = 0; + //`omni.text` is significantly shorter than 2^28 bytes, so we can shove those two booleans + //into the highest bits of one of the offset values if self.is_partial { - buf[4] |= 0x1; + buf[0] |= 0x10000000; } if self.is_wasei { - buf[4] |= 0x2; + buf[0] |= 0x20000000; } } } impl ToPayload for jmdict_traverse::RawGloss<'_> { fn size() -> usize { - 3 + 2 } fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) { + //`omni.text` is never larger than 30-40 MiB. That's slightly more than 2^24 bytes, but + //comfortably below 2^28 bytes. We can therefore use the upper 4 bits of `buf[0]` and + //`buf[1]`, respectively, to encode `self.lang` and `self.g_type`. let r = omni.push_str(self.text); - buf[0] = r.start; - buf[1] = r.end; - buf[2] = self.lang.to_u32() | (self.g_type.to_u32() << 16); + buf[0] = r.start | (self.lang.to_u32() << 28); + buf[1] = r.end | (self.g_type.to_u32() << 28); } } diff --git a/src/lib.rs b/src/lib.rs index 6cafe95..3998d51 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -341,9 +341,9 @@ wrap_iterator!(&'static str, 2, Strings); wrap_iterator!(PartOfSpeech, 1, PartsOfSpeech); wrap_iterator!(SenseTopic, 1, SenseTopics); wrap_iterator!(SenseInfo, 1, SenseInfos); -wrap_iterator!(LoanwordSource, 5, LoanwordSources); +wrap_iterator!(LoanwordSource, 4, LoanwordSources); wrap_iterator!(Dialect, 1, Dialects); -wrap_iterator!(Gloss, 3, Glosses); +wrap_iterator!(Gloss, 2, Glosses); ///An iterator providing fast access to objects in the database. Instances of this iterator ///can be copied cheaply. diff --git a/src/payload.rs b/src/payload.rs index e8a5d1a..f917df7 100644 --- a/src/payload.rs +++ b/src/payload.rs @@ -167,13 +167,13 @@ impl FromPayload<1> for SenseInfo { } } -impl FromPayload<5> for LoanwordSource { - fn get(data: &[u32; 5]) -> Self { +impl FromPayload<4> for LoanwordSource { + fn get(data: &[u32; 4]) -> Self { Self { - text: get_str(data[0], data[1]), + text: get_str(data[0] & 0x0FFFFFFF, data[1]), language: get_str(data[2], data[3]), - is_partial: (data[4] & 0x1) == 0x1, - is_wasei: (data[4] & 0x2) == 0x2, + is_partial: (data[0] & 0x10000000) == 0x10000000, + is_wasei: (data[0] & 0x20000000) == 0x20000000, } } } @@ -184,12 +184,12 @@ impl FromPayload<1> for Dialect { } } -impl FromPayload<3> for Gloss { - fn get(data: &[u32; 3]) -> Self { - let lang_code = data[2] & 0x0000FFFF; - let type_code = (data[2] & 0xFFFF0000) >> 16; +impl FromPayload<2> for Gloss { + fn get(data: &[u32; 2]) -> Self { + let lang_code = (data[0] & 0xF0000000) >> 28; + let type_code = (data[1] & 0xF0000000) >> 28; Gloss { - text: get_str(data[0], data[1]), + text: get_str(data[0] & 0x0FFFFFFF, data[1] & 0x0FFFFFFF), language: jmdict_enums::EnumPayload::from_u32(lang_code), gloss_type: jmdict_enums::EnumPayload::from_u32(type_code), }