reset history

There is about four weeks' worth of history, the interesting parts of
which I've documented in `CONTRIBUTING.md`. I'm now throwing this
history away because there is a lot of messing with data files in there
that bloats the repo unnecessarily, and this is my last chance to get
rid of that bloat before other people start pulling it.
Stefan Majewsky, 3 years ago (commit 5ceeec3acc on main)

.github/workflows/test.yml
@@ -0,0 +1,48 @@
name: tests
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
env:
CARGO_TERM_COLOR: always
jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
featureset:
# common builds
- '--features full'
- '--features default'
- '--features scope-uncommon'
- '--features scope-uncommon,scope-archaic'
# development builds
- '--features db-minimal'
- '--features db-minimal,scope-uncommon'
- '--features db-minimal,scope-uncommon,scope-archaic'
# builds without English glosses
- '--no-default-features --features translations-dut'
- '--no-default-features --features translations-fre'
- '--no-default-features --features translations-ger'
- '--no-default-features --features translations-hun'
- '--no-default-features --features translations-rus'
- '--no-default-features --features translations-slv'
- '--no-default-features --features translations-spa'
- '--no-default-features --features translations-swe'
- '--no-default-features --features translations-dut,scope-uncommon,scope-archaic'
- '--no-default-features --features translations-fre,scope-uncommon,scope-archaic'
- '--no-default-features --features translations-ger,scope-uncommon,scope-archaic'
- '--no-default-features --features translations-hun,scope-uncommon,scope-archaic'
- '--no-default-features --features translations-rus,scope-uncommon,scope-archaic'
- '--no-default-features --features translations-slv,scope-uncommon,scope-archaic'
- '--no-default-features --features translations-spa,scope-uncommon,scope-archaic'
- '--no-default-features --features translations-swe,scope-uncommon,scope-archaic'
steps:
- uses: actions/checkout@v2
- name: build
run: cargo build --verbose ${{ matrix.featureset }}
- name: test
run: cargo test --verbose ${{ matrix.featureset }}

.gitignore

@@ -0,0 +1,3 @@
target/
data/entrypack-*.json.gz
Cargo.lock

CONTRIBUTING.md
@@ -0,0 +1,153 @@
# Contributing
Issues and PRs are welcome, but I cannot make any guarantees about how fast I'll get to them. Some specific remarks:
* If the copy of JMdict is outdated and you need a newer version, open an issue and I'll make a new release with a newer
copy for you. Please don't send a PR for this; I have no way to verify the diff and will do the import myself anyway.
# Explanations
## Payload structure
The obvious idea would be to have `build.rs` generate a bunch of code like this...
```rust
// many fields elided for brevity
static ENTRIES: &[Entry] = &[
Entry {
sequence_number: 1000150,
kanji_elements: &[
KanjiElement {
text: "RS232ケーブル",
},
],
reading_elements: &[
ReadingElement {
text: "アールエスにさんにケーブル",
},
],
senses: &[
Sense {
parts_of_speech: &[
PartOfSpeech::Noun,
],
glosses: &[
Gloss {
text: "rs232 cable",
},
],
},
],
},
...
];
```
...and just `include!()` it into the main binary. The problem with this is that each `&[T]` or `&str` is its own
relocatable object that the linker has to deal with, so compile times, link times and binary size are absurdly high.
I initially optimized this by putting all the strings into one giant string, somewhat like this:
```rust
//This actually comes from an include_str!().
static ALL_TEXT: &str = "ケーブルアールエスにさんにケーブルrs232 cable...";
static ENTRIES: &[EntryRepr] = &[
...,
Entry {
sequence_number: 1000150,
kanji_elements: &[
KanjiElementRepr {
text: StringRef { start: 0, end: 27 },
},
],
reading_elements: &[
ReadingElementRepr {
text: StringRef { start: 27, end: 66 },
},
],
senses: &[
SenseRepr {
parts_of_speech: &[
PartOfSpeech::Noun,
],
glosses: &[
GlossRepr {
text: StringRef { start: 66, end: 77 },
},
],
},
],
},
...
];
```
This helps with the `&str` objects, but the various cascaded `&[T]` slices still remain. I applied the same technique to
those as well:
```rust
static ALL_TEXT: &str = "ケーブルアールエスにさんにケーブルrs232 cable...";
static ALL_K_ELE: &[KanjiElementRepr] = &[
KanjiElementRepr {
text: StringRef { start: 0, end: 27 },
},
...
];
static ALL_R_ELE: &[ReadingElementRepr] = &[
ReadingElementRepr {
text: StringRef { start: 27, end: 66 },
},
...
];
static ALL_POS: &[PartOfSpeech] = &[
PartOfSpeech::Noun,
...
];
static ALL_GLOSSES: &[GlossRepr] = &[
GlossRepr {
text: StringRef { start: 66, end: 77 },
},
...
];
static ALL_SENSES: &[SenseRepr] = &[
SenseRepr {
parts_of_speech: ArrayRef { start: 0, end: 1 },
glosses: ArrayRef { start: 0, end: 1 },
},
...
];
static ALL_ENTRIES: &[EntryRepr] = &[
EntryRepr {
kanji_elements: ArrayRef { start: 0, end: 1 },
reading_elements: ArrayRef { start: 0, end: 1 },
senses: ArrayRef { start: 0, end: 1 },
},
...
];
```
With this and the previous sample, you can see that it's not `Entry` anymore, but `EntryRepr` instead, since those
`StringRef` and `ArrayRef` instances need to be resolved into the things they point to at the API boundary. That's why
the actual exposed types use iterators instead of slice refs for everything: to provide a point where this mapping can
take place.
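To make this concrete, here is a minimal sketch of such a boundary mapping (with simplified stand-in types, not the
crate's actual internals, which decode from flat arrays as described below):
```rust
//Simplified sketch: the public API hands out an iterator that resolves each
//StringRef into the text it points to, so callers never see the Repr types.
static ALL_TEXT: &str = "rs232 cableRS-232 cable";

struct StringRef {
    start: u32,
    end: u32,
}

struct GlossRepr {
    text: StringRef,
}

static ALL_GLOSSES: &[GlossRepr] = &[
    GlossRepr { text: StringRef { start: 0, end: 11 } },
    GlossRepr { text: StringRef { start: 11, end: 23 } },
];

//resolves each GlossRepr into its gloss text on the fly
fn glosses() -> impl Iterator<Item = &'static str> {
    ALL_GLOSSES
        .iter()
        .map(|g| &ALL_TEXT[g.text.start as usize..g.text.end as usize])
}

fn main() {
    for text in glosses() {
        println!("{}", text); //prints "rs232 cable", then "RS-232 cable"
    }
}
```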
The structure as described above produces binaries of reasonable size, but because all that generated code needs to be
parsed by the compiler, compile times are still frustratingly slow (on the order of minutes for a full build). And
what's worse, the compiler uses so much working memory that my desktop PC with 16 GiB of RAM went OOM trying to compile
it.
To avoid the need for parsing generated code altogether, I finally replaced all `&[TRepr]` arrays with a single
`static ALL_DATA: &[u32]` that gets imported from a binary file via `include_bytes!()`. `u32` was chosen because it is
large enough to index into all relevant structures (both `ALL_TEXT` and `ALL_DATA`). I could have encoded enum variants
as `u16`, but for now, I preferred the simplicity of having everything in one place and accepted the slight inefficiency
in encoding.
Besides `ALL_TEXT` and `ALL_DATA`, there is one final structure, `static ALL_ENTRY_OFFSETS: &[u32]`, which, as an
entrypoint into the self-referencing structure of `ALL_DATA`, provides the offsets into `ALL_DATA` where entries are
located.
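For illustration, the decoding entry point then looks roughly like this (a sketch with empty placeholder arrays; the
four-u32 entry header layout matches the `ToPayload` impl for `RawEntry` in `build.rs` below):
```rust
//Sketch of the decode side. In the real crate, both arrays are backed by
//files that build.rs generates and that get imported via include_bytes!().
static ALL_ENTRY_OFFSETS: &[u32] = &[];
static ALL_DATA: &[u32] = &[];

//Each entry header is 4 u32s: [members start, members end, packed offsets, ent_seq].
fn entry_header(idx: usize) -> &'static [u32] {
    let start = ALL_ENTRY_OFFSETS[idx] as usize;
    &ALL_DATA[start..start + 4]
}

fn sequence_number(idx: usize) -> u32 {
    entry_header(idx)[3]
}

fn main() {
    println!("{} entries in payload", ALL_ENTRY_OFFSETS.len());
    if !ALL_ENTRY_OFFSETS.is_empty() {
        println!("first sequence number: {}", sequence_number(0));
    }
}
```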

Cargo.toml
@@ -0,0 +1,76 @@
[workspace]
members = [
".",
"jmdict-enums",
"jmdict-traverse",
]
[package]
name = "jmdict"
version = "0.1.0"
authors = [
# implementation
"Stefan Majewsky <majewsky@gmx.net>",
# database contents
"JMdict Contributors",
]
edition = "2018"
description = "The free Japanese dictionary database JMdict, pre-digested for use in Rust libs and apps."
documentation = "https://docs.rs/jmdict/"
readme = "README.md"
homepage = "https://github.com/majewsky/rust-jmdict"
license = "Apache-2.0"
keywords = [ "jmdict", "edict", "edrdg", "japanese", "dictionary" ]
exclude = [
"data/*",
"with-local-entrypack.sh"
]
[dependencies]
align-data = "^0.1.0"
jmdict-enums = { path = "jmdict-enums", version = "0.1.0" }
[build-dependencies]
jmdict-enums = { path = "jmdict-enums", version = "0.1.0" }
jmdict-traverse = { path = "jmdict-traverse", version = "0.1.0" }
[dev-dependencies]
jmdict-traverse = { path = "jmdict-traverse", version = "0.1.0" }
[features]
default = [
"translations-eng",
]
full = [
"scope-uncommon",
"scope-archaic",
"translations-eng",
"translations-dut",
"translations-fre",
"translations-ger",
"translations-hun",
"translations-rus",
"translations-slv",
"translations-spa",
"translations-swe",
]
scope-uncommon = []
scope-archaic = ["jmdict-enums/scope-archaic"]
translations-eng = ["jmdict-enums/translations-eng"]
translations-dut = ["jmdict-enums/translations-dut"]
translations-fre = ["jmdict-enums/translations-fre"]
translations-ger = ["jmdict-enums/translations-ger"]
translations-hun = ["jmdict-enums/translations-hun"]
translations-rus = ["jmdict-enums/translations-rus"]
translations-slv = ["jmdict-enums/translations-slv"]
translations-spa = ["jmdict-enums/translations-spa"]
translations-swe = ["jmdict-enums/translations-swe"]
# WARNING: These produce a broken build. Read the module-level docs before proceeding.
db-empty = []
db-minimal = []
[package.metadata.docs.rs]
all-features = true

LICENSE
@@ -0,0 +1,212 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------
Note:
Individual files contain the following tag instead of the full license text.
SPDX-License-Identifier: Apache-2.0
This enables machine processing of license information based on the SPDX
License Identifiers that are available at: https://spdx.org/licenses/

README.md
@@ -0,0 +1,49 @@
# WARNING: Licensing on database files
The database files compiled into the crate are licensed from the Electronic Dictionary Research and Development Group
under Creative Commons licenses. Applications linking this crate directly or indirectly must display appropriate
copyright notices. Please refer to the [EDRDG's license statement](https://www.edrdg.org/edrdg/licence.html) for details.
# rust-jmdict
![GitHub Actions Badge](https://github.com/majewsky/rust-jmdict/actions/workflows/test.yml/badge.svg)
The `jmdict` crate contains the data from the [JMdict file](https://www.edrdg.org/jmdict/j_jmdict.html), a comprehensive
multilingual dictionary of the Japanese language. The original JMdict file, included in this repository (and hence, in
releases of this crate), comes as XML. Instead of stuffing the XML into the binary directly, this crate parses the XML at
compile time and generates an optimized representation for inclusion in the final binary.
In short, this crate does:
* parse the XML structure of the JMdict database file,
* provide an API to access its entries, and
* provide compile-time flags (via Cargo features) to select the amount of information included in the binary.
This crate does NOT:
* provide fast lookup into the database. You get a list of entries and then you can build your own indexing on top as
required by your application (see the sketch below).
For specific examples, please check out the [documentation on docs.rs](https://docs.rs/jmdict/).
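For instance, a simple lookup index can be layered on top in a few lines (a sketch; like the bundled example, it only
assumes that `jmdict::entries()` yields entries whose reading elements expose a `text` field):
```rust
use std::collections::HashMap;

fn main() {
    //Build the index once; afterwards, lookups by reading are cheap.
    let entries: Vec<jmdict::Entry> = jmdict::entries().collect();
    let mut index: HashMap<&str, Vec<usize>> = HashMap::new();
    for (idx, entry) in entries.iter().enumerate() {
        for r in entry.reading_elements() {
            index.entry(r.text).or_default().push(idx);
        }
    }
    println!("indexed {} distinct readings", index.len());
}
```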
## Building
When packaging for crates.io, we cannot include the actual payload data (`data/entrypack.json`) because crates.io has a
limit of 10 MiB per crate. (Technically, we could ship the data by depending on a series of data crates each slightly
under 10 MiB, but I intend to be a good citizen and not abuse the shared infrastructure of crates.io needlessly.)
Hence the default strategy is to pull the entrypack (a preprocessed form of the JMdict contents) at build time from
a server under the crate owner's control, currently <https://dl.xyrillian.de/jmdict/>. Each released crate version will
have the most recent entrypack (as of the time of publication) hardcoded into its code, along with a SHA-256 checksum to
ensure data integrity.
If downloading the entrypack at build time is not possible (e.g. because the build machine does not have internet
access, or because `curl` is not installed on the build machine), download the entrypack beforehand and put its path in
the `RUST_JMDICT_ENTRYPACK` environment variable when running `cargo build`.
For development purposes, when building from the repository, `data/entrypack.json` will be used instead. If this is not
desired, set the `RUST_JMDICT_ENTRYPACK` environment variable to `default` to force the normal download behavior.
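Put together, the source selection amounts to something like this (a hypothetical sketch, not the actual
`jmdict-traverse` code; `entrypack_location()` and `download_entrypack()` are made-up names):
```rust
use std::path::PathBuf;

//Hypothetical sketch of the rules described above; the real logic lives in
//the jmdict-traverse crate.
fn entrypack_location() -> PathBuf {
    match std::env::var("RUST_JMDICT_ENTRYPACK") {
        //an explicit path to a pre-downloaded entrypack wins
        Ok(path) if path != "default" => PathBuf::from(path),
        //"default" forces the normal download behavior
        Ok(_) => download_entrypack(),
        Err(_) => {
            let local = PathBuf::from("data/entrypack.json");
            if local.exists() {
                local //development build from the repository
            } else {
                download_entrypack() //verified against a hardcoded SHA-256
            }
        }
    }
}

fn download_entrypack() -> PathBuf {
    unimplemented!("placeholder: fetch the hardcoded entrypack URL and verify its checksum")
}

fn main() {
    println!("using entrypack at {:?}", entrypack_location());
}
```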
## Contributing
If you plan to open issues or write code, please have a look at [CONTRIBUTING.md](CONTRIBUTING.md).

build.rs
@@ -0,0 +1,333 @@
/*******************************************************************************
* Copyright 2021 Stefan Majewsky <majewsky@gmx.net>
* SPDX-License-Identifier: Apache-2.0
* Refer to the file "LICENSE" for details.
*******************************************************************************/
#[cfg(not(any(
feature = "translations-eng",
feature = "translations-dut",
feature = "translations-fre",
feature = "translations-ger",
feature = "translations-hun",
feature = "translations-rus",
feature = "translations-slv",
feature = "translations-spa",
feature = "translations-swe"
)))]
compile_error!("no target languages selected (select at least one \"translations-XXX\" feature)");
use jmdict_enums::*;
use std::convert::TryInto;
use std::io::Write;
fn main() {
println!("cargo:rerun-if-changed=build.rs");
let opts = jmdict_traverse::Options {
is_db_minimal: cfg!(feature = "db-minimal"),
with_uncommon: cfg!(feature = "scope-uncommon"),
with_archaic: cfg!(feature = "scope-archaic"),
};
let mut omni: OmniBuffer = Default::default();
if cfg!(not(feature = "db-empty")) {
jmdict_traverse::process_dictionary(&mut omni, opts);
}
write_u32s(&path_to("entry_offsets.dat"), &omni.entry_offsets);
write_u32s(&path_to("payload.dat"), &omni.data);
std::fs::write(&path_to("strings.txt"), &omni.text).unwrap();
}
fn path_to(filename: &str) -> std::path::PathBuf {
let out_dir = std::env::var_os("OUT_DIR").unwrap();
std::path::Path::new(&out_dir).join(filename)
}
fn write_u32s(path: &std::path::Path, vals: &[u32]) {
let f = std::fs::File::create(&path).unwrap();
let mut f = std::io::BufWriter::new(f);
for val in vals {
f.write_all(&val.to_ne_bytes()).unwrap();
}
}
///Helper type for references into OmniBuffer::data or OmniBuffer::text.
///Gets constructed as `(start, end).into()` in the respective OmniBuffer methods.
struct StoredRef {
start: u32,
end: u32,
}
impl From<(usize, usize)> for StoredRef {
fn from(val: (usize, usize)) -> Self {
let (start, end) = val;
let start = start.try_into().unwrap();
let end = end.try_into().unwrap();
Self { start, end }
}
}
///Buffer where all payload gets accumulated before being written into the generated data files.
///Check the explanations in CONTRIBUTING.md for how this works, and why it was built this way.
#[derive(Default)]
struct OmniBuffer {
entry_offsets: Vec<u32>,
data: Vec<u32>,
text: String,
}
impl OmniBuffer {
pub fn push_str(&mut self, text: &str) -> StoredRef {
//optimization: empty text doesn't require any work
if text.is_empty() {
return (0, 0).into();
}
let start = self.text.len();
self.text.push_str(text);
let end = self.text.len();
(start, end).into()
}
pub fn push_data(&mut self, data: &[u32]) -> StoredRef {
//optimization: empty arrays don't require any work
if data.is_empty() {
return (0, 0).into();
}
let start = self.data.len();
self.data.extend(data);
(start, start + data.len()).into()
}
pub fn push_array<T: ToPayload>(&mut self, data: &[T]) -> StoredRef {
//optimization: empty arrays don't require any work
if data.is_empty() {
return (0, 0).into();
}
//render all items into a contiguous Vec<u32>
let size = T::size();
let mut repr = vec![0u32; data.len() * size];
for (idx, elem) in data.iter().enumerate() {
elem.encode_one(self, &mut repr[(idx * size)..((idx + 1) * size)]);
}
self.push_data(&repr)
}
}
impl jmdict_traverse::Visitor for OmniBuffer {
fn notify_data_file_path(&mut self, path: &str) {
println!("cargo:rerun-if-changed={}", &path);
}
fn process_entry(&mut self, entry: &jmdict_traverse::RawEntry) {
let size = jmdict_traverse::RawEntry::size();
let mut repr = vec![0u32; size];
entry.encode_one(self, &mut repr);
let r = self.push_data(&repr);
self.entry_offsets.push(r.start);
}
}
//Like omni.push_array(), but does not push the resulting array just yet.
fn push_array<T: ToPayload>(buf: &mut Vec<u32>, omni: &mut OmniBuffer, array: &[T]) -> u32 {
if !array.is_empty() {
let size = T::size();
let mut repr = vec![0u32; array.len() * size];
for (idx, elem) in array.iter().enumerate() {
elem.encode_one(omni, &mut repr[(idx * size)..((idx + 1) * size)]);
}
buf.extend(repr);
}
buf.len() as u32
}
///Helper trait for encoding types from the jmdict-traverse crate into a sequence of u32 for
///embedding in OmniBuffer::data.
trait ToPayload {
///How many u32 are needed to encode one item of this type.
fn size() -> usize;
///Encode one item of this type into the given preallocated buffer of length `Self::size()`.
fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]);
}
//NOTE: It would be really nice to just do `impl ToPayload for T where T: EnumPayload`, but this
//conflicts with all other `impl ToPayload` under the current specialization rules.
macro_rules! enum_to_payload {
($t:ident) => {
impl ToPayload for $t {
fn size() -> usize {
1
}
fn encode_one(&self, _omni: &mut OmniBuffer, buf: &mut [u32]) {
buf[0] = self.to_u32();
}
}
};
}
enum_to_payload!(KanjiInfo);
enum_to_payload!(ReadingInfo);
enum_to_payload!(PartOfSpeech);
enum_to_payload!(SenseTopic);
enum_to_payload!(SenseInfo);
enum_to_payload!(Dialect);
impl ToPayload for jmdict_traverse::RawEntry<'_> {
fn size() -> usize {
4
}
fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) {
//Instead of using `omni.push_array()` on each member and encoding each StoredRef
//separately, we concatenate the payload representations of all member arrays and
//`push_data()` them all at once. We then encode that StoredRef, plus offsets to split the
//encoded array back into its constituents. Since each encoded array is rather short, the
//offsets fit into a single byte, so we can encode both (plus self.ent_seq) in a single u32.
//
//Compared to the naive layout as 3 StoredRef + 1 u32 (28 bytes), we save 12 bytes per Sense.
let mut dbuf = Vec::new();
let offset1 = push_array(&mut dbuf, omni, &self.k_ele);
let offset2 = push_array(&mut dbuf, omni, &self.r_ele);
push_array(&mut dbuf, omni, &self.sense);
let r = omni.push_data(&dbuf);
buf[0] = r.start;
buf[1] = r.end;
buf[2] = offset1 + (offset2 << 16);
buf[3] = self.ent_seq;
}
}
impl ToPayload for jmdict_traverse::RawKanjiElement<'_> {
fn size() -> usize {
5
}
fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) {
buf[0] = self.ke_pri.to_u32();
let r = omni.push_str(self.keb);
buf[1] = r.start;
buf[2] = r.end;
let r = omni.push_array(&self.ke_inf);
buf[3] = r.start;
buf[4] = r.end;
}
}
impl ToPayload for jmdict_traverse::RawReadingElement<'_> {
fn size() -> usize {
5
}
fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) {
buf[0] = self.re_pri.to_u32();
let r = omni.push_str(self.reb);
buf[1] = r.start;
buf[2] = r.end;
let r = omni.push_array(&self.re_inf);
buf[3] = r.start;
buf[4] = r.end;
}
}
impl ToPayload for jmdict_traverse::RawSense<'_> {
fn size() -> usize {
5
}
fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) {
//Instead of using `omni.push_array()` on each member and encoding each StoredRef
//separately, we concatenate the payload representations of all member arrays and
//`push_data()` them all at once. We then encode that StoredRef, plus offsets to split the
//encoded array back into its constituents. Since each encoded array is rather short, the
//offsets fit into a single byte, so we can encode four at a time in a single u32.
//
//Compared to the naive layout as 11 StoredRef (88 bytes), we save 68 bytes per Sense.
let mut dbuf = Vec::new();
let offset1 = push_array(&mut dbuf, omni, &self.stagk);
let offset2 = push_array(&mut dbuf, omni, &self.stagr);
let offset3 = push_array(&mut dbuf, omni, &self.pos);
let offset4 = push_array(&mut dbuf, omni, &self.xref);
let offset5 = push_array(&mut dbuf, omni, &self.ant);
let offset6 = push_array(&mut dbuf, omni, &self.field);
let offset7 = push_array(&mut dbuf, omni, &self.misc);
let offset8 = push_array(&mut dbuf, omni, &self.s_inf);
let offset9 = push_array(&mut dbuf, omni, &self.lsource);
let offset10 = push_array(&mut dbuf, omni, &self.dial);
push_array(&mut dbuf, omni, &self.gloss);
let r = omni.push_data(&dbuf);
buf[0] = r.start;
buf[1] = r.end;
buf[2] = offset1 + (offset2 << 8) + (offset3 << 16) + (offset4 << 24);
buf[3] = offset5 + (offset6 << 8) + (offset7 << 16) + (offset8 << 24);
buf[4] = offset9 + (offset10 << 8);
}
}
impl ToPayload for jmdict_traverse::RawLSource<'_> {
fn size() -> usize {
5
}
fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) {
let r = omni.push_str(self.text);
buf[0] = r.start;
buf[1] = r.end;
let r = omni.push_str(self.lang);
buf[2] = r.start;
buf[3] = r.end;
buf[4] = 0;
if self.is_partial {
buf[4] |= 0x1;
}
if self.is_wasei {
buf[4] |= 0x2;
}
}
}
impl ToPayload for jmdict_traverse::RawGloss<'_> {
fn size() -> usize {
3
}
fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) {
let r = omni.push_str(self.text);
buf[0] = r.start;
buf[1] = r.end;
buf[2] = self.lang.to_u32() | (self.g_type.to_u32() << 16);
}
}
impl<'a> ToPayload for &'a str {
fn size() -> usize {
2
}
fn encode_one(&self, omni: &mut OmniBuffer, buf: &mut [u32]) {
let r = omni.push_str(self);
buf[0] = r.start;
buf[1] = r.end;
}
}
impl ToPayload for u32 {
fn size() -> usize {
1
}
fn encode_one(&self, _omni: &mut OmniBuffer, buf: &mut [u32]) {
buf[0] = *self;
}
}

data/Makefile
@@ -0,0 +1,16 @@
default:
@printf '%s\n' '>> Usage:' ' make import JMDICT_PATH=/path/to/jmdict' ' make export' '>> Refer to README.md for details.'
import:
ifeq ($(origin JMDICT_PATH),undefined)
@echo "ERROR: Run as \`make import JMDICT_PATH=/path/to/JMdict\`".
@false
endif
go run preprocess-jmdict.go $(JMDICT_PATH)
EXPORT_FILENAME ?= entrypack-v1-$(shell cat entrypack.json | grep -o 'Creation Date: [0-9-]*' | awk '{print$$3}').json.gz
export:
gzip -9 < entrypack.json > $(EXPORT_FILENAME)
.PHONY: default import export

data/README.md
@@ -0,0 +1,27 @@
# `data/`
We cannot put the JMdict into Git as one single file because its size is over 100 MiB, and GitHub does not like files
that big. I don't want to use LFS because it's still a text file and thus delta-compresses really well if you let Git do
its job. Therefore we split the file into chunks of roughly 1000 entries each.
Since we're pre-processing anyway, we're also converting from XML to JSON. The original XML file uses a lot of memory
when parsed as a whole, and parsing in pieces is finicky because we need to carry over the DTD into each chunk, if only
for the entity definitions. The JSON files in this directory, on the other hand, do not have any magical entities and
thus trivially parse as individual entries. It also turns out that parsing JSON is much quicker than parsing XML, which
makes a significant impact on the build time of the whole crate.
## Import workflow
To update the JMdict copy in this directory, run `make import JMDICT_PATH=/path/to/JMdict`. Check the `git diff`
afterwards; it should usually only show changes for a few places where upstream edited the respective JMdict entries.
## Export workflow
We cannot bundle the data files with the crates when publishing because crates.io imposes a 10 MiB limit on crates. The
data files are therefore stored in a compressed bundle by `make export`. The output file appears in this directory as
`entrypack-v1-YYYY-MM-DD.json.gz`, with the date being extracted from JMdict's own creation timestamp in
`entrypack.json`.
This file can then be copied to its web server location, currently residing at <http://dl.xyrillian.de/jmdict/> under
the control of [@majewsky](https://github.com/majewsky). Finally, update the constants at the top of
`jmdict-traverse/src/file.rs` to refer to the new file.

File diff suppressed because one or more lines are too long

data/preprocess-jmdict.go
@@ -0,0 +1,214 @@
/*******************************************************************************
* Copyright 2021 Stefan Majewsky <majewsky@gmx.net>
* SPDX-License-Identifier: Apache-2.0
* Refer to the file "LICENSE" for details.
*******************************************************************************/
package main
import (
"bufio"
"bytes"
"encoding/json"
"encoding/xml"
"fmt"
"io/ioutil"
"os"
"regexp"
"strings"
)
func main() {
if len(os.Args) != 2 {
fmt.Fprintf(os.Stderr, "usage: %s <path-to-JMdict>\n", os.Args[0])
os.Exit(1)
}
//open input file for line-wise reading
file, err := os.Open(os.Args[1])
must(err)
fileBuffered := bufio.NewReaderSize(file, 65536)
nextLine := func() string {
line, err := fileBuffered.ReadString('\n')
must(err)
return strings.TrimSpace(line)
}
processOpening(nextLine)
processEntries(nextLine)
}
func must(err error) {
if err != nil {
panic(err.Error())
}
}
////////////////////////////////////////////////////////////////////////////////
// process opening (everything until <JMdict>)
var (
entityHeaderRx = regexp.MustCompile(`^<!-- <(\S+)> .*entities -->$`)
entityDefRx = regexp.MustCompile(`^<!ENTITY (\S+) "(.+)">$`)
)
func processOpening(nextLine func() string) {
var (
sets = make(map[string]map[string]string)
currentSet = ""
)
for {
line := nextLine()
//This loop sees all the lines of the DTD up to the opener of the actual
//document contents.
if line == "<JMdict>" {
break
}
//Start a new entity set when encountering its header comment.
match := entityHeaderRx.FindStringSubmatch(line)
if match != nil {
currentSet = match[1]
sets[currentSet] = make(map[string]string)
}
//When inside an entity set, add all subsequent entities to the set.
match = entityDefRx.FindStringSubmatch(line)
if match != nil {
key, value := match[1], match[2]
if currentSet == "" {
panic("entity definition outside of set: " + line)
}
sets[currentSet][key] = value
//the XML decoder also needs to know about these entities, but we have it
//expand the entity into the XML representation of the entity (e.g.
//"&arch;" expands into "arch" rather than "archaism")
decoderEntities[key] = key
}
}
//dump collected data
buf, err := json.Marshal(sets)
must(err)
var indented bytes.Buffer
must(json.Indent(&indented, buf, "", "\t"))
must(ioutil.WriteFile("../jmdict-enums/data/entities.json", indented.Bytes(), 0666))
}
////////////////////////////////////////////////////////////////////////////////
// process contents (everything between <JMdict> and </JMdict>)
func processEntries(nextLine func() string) {
outputFile, err := os.Create("entrypack.json")
must(err)
defer outputFile.Close()
buf := ""
for {
line := nextLine()
//This loop ends when we encounter the end of the file.
if line == "</JMdict>" {
if buf != "" {
//we should have had </entry> just before and thus have an empty buffer
panic("reached </JMdict> with non-empty buffer: " + buf)
}
break
}
//Collect lines until we have a full entry to process.
buf += line
if line == "</entry>" {
_, err := outputFile.Write([]byte(processEntry(buf)))
must(err)
buf = ""
}
}
}
////////////////////////////////////////////////////////////////////////////////
// convert individual entries from XML to JSON
//
// NOTE: In the JSON, common fields have single-letter keys because that
// actually saves several MiB. Going from 90 MiB to 75 MiB is quite
// significant since it gives us more headroom before running into GitHub's hard
// limit of 100 MiB per object.
type dictEntry struct {
SeqNo uint64 `xml:"ent_seq" json:"n"`
KEle []dictKEle `xml:"k_ele" json:"K,omitempty"`
REle []dictREle `xml:"r_ele" json:"R"`
Sense []dictSense `xml:"sense" json:"S"`
}
type dictKEle struct {
Keb string `xml:"keb" json:"t"`
KeInf []string `xml:"ke_inf" json:"i,omitempty"`
KePri []string `xml:"ke_pri" json:"p,omitempty"`
}
type dictREle struct {
Reb string `xml:"reb" json:"t"`
ReNokanji boolByPresence `xml:"re_nokanji" json:"n,omitempty"`
ReRestr []string `xml:"re_restr" json:"r,omitempty"`
ReInf []string `xml:"re_inf" json:"i,omitempty"`
RePri []string `xml:"re_pri" json:"p,omitempty"`
}
type dictSense struct {
Stagk []string `xml:"stagk" json:"stagk,omitempty"`
Stagr []string `xml:"stagr" json:"stagr,omitempty"`
Pos []string `xml:"pos" json:"p,omitempty"`
Xref []string `xml:"xref" json:"xref,omitempty"`
Ant []string `xml:"ant" json:"ant,omitempty"`
Field []string `xml:"field" json:"f,omitempty"`
Misc []string `xml:"misc" json:"m,omitempty"`
SInf []string `xml:"s_inf" json:"i,omitempty"`
Lsource []dictLsource `xml:"lsource" json:"L,omitempty"`
Dial []string `xml:"dial" json:"dial,omitempty"`
Gloss []dictGloss `xml:"gloss" json:"G,omitempty"`
}
type dictLsource struct {
Text string `xml:",chardata" json:"t"`
Lang string `xml:"lang,attr" json:"l,omitempty"`
LsType string `xml:"ls_type,attr" json:"type,omitempty"`
LsWasei string `xml:"ls_wasei,attr" json:"wasei,omitempty"`
}
type dictGloss struct {
Text string `xml:",chardata" json:"t"`
Lang string `xml:"lang,attr" json:"l,omitempty"`
GGend string `xml:"g_gend,attr" json:"g_gend,omitempty"`
GType string `xml:"g_type,attr" json:"g_type,omitempty"`
Pri []string `xml:"pri" json:"pri,omitempty"`
//NOTE: g_gend and <pri> are defined in the DTD, but do not actually occur in any entry.
}
var decoderEntities = make(map[string]string)
func processEntry(xmlStr string) string {
var e dictEntry
dec := xml.NewDecoder(strings.NewReader(xmlStr))
dec.Entity = decoderEntities
must(dec.Decode(&e))
jsonBytes, err := json.Marshal(e)
must(err)
return string(jsonBytes) + "\n"
}
////////////////////////////////////////////////////////////////////////////////
// helper types for XML decoding
//boolByPresence decodes to true when the corresponding element is present.
type boolByPresence bool
func (b *boolByPresence) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
//This is only called if the element is present. Otherwise, it stays at its default value of false.
*b = true
//The xml.Decoder will croak unless we consume the element.
var foo struct{}
return d.DecodeElement(&foo, &start)
}

@@ -0,0 +1,16 @@
/*******************************************************************************
* Copyright 2021 Stefan Majewsky <majewsky@gmx.net>
* SPDX-License-Identifier: Apache-2.0
* Refer to the file "LICENSE" for details.
*******************************************************************************/
fn main() {
let input = "日曜日";
let count = jmdict::entries()
.filter(|e| {
e.kanji_elements().any(|k| k.text == input)
|| e.reading_elements().any(|r| r.text == input)
})
.count();
println!("{} entries for {}", count, input);
}

jmdict-enums/Cargo.toml
@@ -0,0 +1,26 @@
[package]
name = "jmdict-enums"
version = "0.1.0"
authors = ["Stefan Majewsky <majewsky@gmx.net>"]
edition = "2018"
description = "Autogenerated enums for the jmdict crate. Do not import directly."
readme = "README.md"
homepage = "https://github.com/majewsky/rust-jmdict/tree/main/jmdict-enums"
license = "Apache-2.0"
[dependencies]
[build-dependencies]
json = "^0.12.0"
[features]
scope-archaic = []
translations-eng = []
translations-dut = []
translations-fre = []
translations-ger = []
translations-hun = []
translations-rus = []
translations-slv = []
translations-spa = []
translations-swe = []

jmdict-enums/README.md
@@ -0,0 +1,12 @@
# jmdict-enums
Autogenerated enums for the `jmdict` crate.
This code is in a separate crate because, if we put it in the `jmdict` crate itself, its `build.rs`
could not import it.
## Compatibility promise
**There is none.** This crate can disappear at any time if we choose to restructure the build system
for the `jmdict` crate. To use the types from this crate, look at the re-exports of the same name in
[the `jmdict` crate](https://docs.rs/jmdict/).

jmdict-enums/build.rs
@@ -0,0 +1,514 @@
/*******************************************************************************
* Copyright 2021 Stefan Majewsky <majewsky@gmx.net>
* SPDX-License-Identifier: Apache-2.0
* Refer to the file "LICENSE" for details.
*******************************************************************************/
use json::JsonValue;
struct EnumVariant {
code: &'static str,
name: &'static str,
enabled: bool,
}
fn v(code: &'static str, name: &'static str) -> EnumVariant {
EnumVariant {
code,
name,
enabled: true,
}
}
impl EnumVariant {
fn when(self, enabled: bool) -> Self {
Self { enabled, ..self }
}
}
struct Enum<'a> {
name: &'static str,
all_name: Option<&'static str>,
doc: String,
entities: Option<&'a JsonValue>,
variants: Vec<EnumVariant>,
}
fn main() {
println!("cargo:rerun-if-changed=build.rs");
println!("cargo:rerun-if-changed=data/entities.json");
let entities_str = std::fs::read_to_string("data/entities.json").unwrap();
let entities = json::parse(&entities_str).unwrap();
let mut content = String::new();
content.push_str(&process(Enum {
name: "Dialect",
all_name: None,
doc: "Dialect of Japanese in which a certain vocabulary occurs.".into(),
entities: Some(&entities["dial"]),
variants: vec![
v("hob", "Hokkaido"),
v("ksb", "Kansai"),
v("ktb", "Kantou"),
v("kyb", "Kyoto"),
v("kyu", "Kyuushuu"),
v("nab", "Nagano"),
v("osb", "Osaka"),
v("rkb", "Ryuukyuu"),
v("thb", "Touhoku"),
v("tsb", "Tosa"),
v("tsug", "Tsugaru"),
],
}));
content.push_str(&process(Enum {
name: "GlossLanguage",
all_name: Some("AllGlossLanguage"),
doc: "The language of a particular Gloss.".into(),
entities: None,
variants: vec![
v("eng", "English").when(cfg!(feature = "translations-eng")),
v("dut", "Dutch").when(cfg!(feature = "translations-dut")),
v("fre", "French").when(cfg!(feature = "translations-fre")),
v("ger", "German").when(cfg!(feature = "translations-ger")),
v("hun", "Hungarian").when(cfg!(feature = "translations-hun")),
v("rus", "Russian").when(cfg!(feature = "translations-rus")),
v("slv", "Slovenian").when(cfg!(feature = "translations-slv")),
v("spa", "Spanish").when(cfg!(feature = "translations-spa")),
v("swe", "Swedish").when(cfg!(feature = "translations-swe")),
],
}));
content.push_str(&process(Enum {
name: "GlossType",
all_name: None,
doc: "Type of gloss.".into(),
entities: None,
variants: vec![
v("", "RegularTranslation"),
v("expl", "Explanation"),
v("fig", "FigurativeSpeech"),
v("lit", "LiteralTranslation"),
],
}));
content.push_str(&process(Enum {
name: "KanjiInfo",
all_name: None,
doc: "Information regarding a certain KanjiElement.".into(),
entities: Some(&entities["ke_inf"]),
variants: vec![
v("ateji", "Ateji"),
v("iK", "IrregularKanjiUsage"),
v("ik", "IrregularKanaUsage"),
v("io", "IrregularOkuriganaUsage"),
v("oK", "OutdatedKanji"),
],
}));
content.push_str(&process(Enum {
name: "PartOfSpeech",
all_name: Some("AllPartOfSpeech"),
doc: "Where a word can appear in a sentence for a particular Sense of the word.".into(),
entities: Some(&entities["pos"]),
variants: vec![
v("adj-f", "NounOrVerbActingPrenominally"),
v("adj-i", "Adjective"),
v("adj-ix", "YoiAdjective"),
v("adj-kari", "KariAdjective").when(cfg!(feature = "scope-archaic")),
v("adj-ku", "KuAdjective").when(cfg!(feature = "scope-archaic")),
v("adj-na", "AdjectivalNoun"),
v("adj-nari", "NariAdjective").when(cfg!(feature = "scope-archaic")),
v("adj-no", "NoAdjective"),
v("adj-pn", "PreNounAdjectival"),
v("adj-shiku", "ShikuAdjective").when(cfg!(feature = "scope-archaic")),
v("adj-t", "TaruAdjective"),
v("adv", "Adverb"),
v("adv-to", "AdverbTakingToParticle"),
v("aux", "Auxiliary"),
v("aux-adj", "AuxiliaryAdjective"),
v("aux-v", "AuxiliaryVerb"),
v("conj", "Conjunction"),
v("cop", "Copula"),
v("ctr", "Counter"),
v("exp", "Expression"),
v("int", "Interjection"),
v("n", "CommonNoun"),
v("n-adv", "AdverbialNoun"),
v("n-pr", "ProperNoun"),
v("n-pref", "NounPrefix"),
v("n-suf", "NounSuffix"),
v("n-t", "TemporalNoun"),
v("num", "Numeric"),
v("pn", "Pronoun"),
v("pref", "Prefix"),
v("prt", "Particle"),
v("suf", "Suffix"),
v("unc", "Unclassified"),
v("v-unspec", "UnspecifiedVerb"),
v("v1", "IchidanVerb"),
v("v1-s", "IchidanKureruVerb"),
v("v2a-s", "NidanUVerb").when(cfg!(feature = "scope-archaic")),
v("v2b-k", "UpperNidanBuVerb").when(cfg!(feature = "scope-archaic")),
v("v2b-s", "LowerNidanBuVerb").when(cfg!(feature = "scope-archaic")),
v("v2d-k", "UpperNidanDzuVerb").when(cfg!(feature = "scope-archaic")),
v("v2d-s", "LowerNidanDzuVerb").when(cfg!(feature = "scope-archaic")),
v("v2g-k", "UpperNidanGuVerb").when(cfg!(feature = "scope-archaic")),
v("v2g-s", "LowerNidanGuVerb").when(cfg!(feature = "scope-archaic")),
v("v2h-k", "UpperNidanFuVerb").when(cfg!(feature = "scope-archaic")),
v("v2h-s", "LowerNidanFuVerb").when(cfg!(feature = "scope-archaic")),
v("v2k-k", "UpperNidanKuVerb").when(cfg!(feature = "scope-archaic")),
v("v2k-s", "LowerNidanKuVerb").when(cfg!(feature = "scope-archaic")),
v("v2m-k", "UpperNidanMuVerb").when(cfg!(feature = "scope-archaic")),
v("v2m-s", "LowerNidanMuVerb").when(cfg!(feature = "scope-archaic")),
v("v2n-s", "LowerNidanNuVerb").when(cfg!(feature = "scope-archaic")),
v("v2r-k", "UpperNidanRuVerb").when(cfg!(feature = "scope-archaic")),
v("v2r-s", "LowerNidanRuVerb").when(cfg!(feature = "scope-archaic")),
v("v2s-s", "LowerNidanSuVerb").when(cfg!(feature = "scope-archaic")),
v("v2t-k", "UpperNidanTsuVerb").when(cfg!(feature = "scope-archaic")),
v("v2t-s", "LowerNidanTsuVerb").when(cfg!(feature = "scope-archaic")),
v("v2w-s", "LowerNidanUWeVerb").when(cfg!(feature = "scope-archaic")),
v("v2y-k", "UpperNidanYuVerb").when(cfg!(feature = "scope-archaic")),
v("v2y-s", "LowerNidanYuVerb").when(cfg!(feature = "scope-archaic")),
v("v2z-s", "LowerNidanZuVerb").when(cfg!(feature = "scope-archaic")),
v("v4b", "YodanBuVerb").when(cfg!(feature = "scope-archaic")),
v("v4g", "YodanGuVerb").when(cfg!(feature = "scope-archaic")),
v("v4h", "YodanFuVerb").when(cfg!(feature = "scope-archaic")),
v("v4k", "YodanKuVerb").when(cfg!(feature = "scope-archaic")),
v("v4m", "YodanMuVerb").when(cfg!(feature = "scope-archaic")),
v("v4n", "YodanNuVerb").when(cfg!(feature = "scope-archaic")),
v("v4r", "YodanRuVerb").when(cfg!(feature = "scope-archaic")),
v("v4s", "YodanSuVerb").when(cfg!(feature = "scope-archaic")),
v("v4t", "YodanTsuVerb").when(cfg!(feature = "scope-archaic")),
v("v5aru", "GodanAruVerb"),
v("v5b", "GodanBuVerb"),
v("v5g", "GodanGuVerb"),
v("v5k", "GodanKuVerb"),
v("v5k-s", "GodanIkuVerb"),
v("v5m", "GodanMuVerb"),
v("v5n", "GodanNuVerb"),
v("v5r", "GodanRuVerb"),
v("v5r-i", "IrregularGodanRuVerb"),
v("v5s", "GodanSuVerb"),
v("v5t", "GodanTsuVerb"),
v("v5u", "GodanUVerb"),
v("v5u-s", "IrregularGodanUVerb"),
v("vi", "IntransitiveVerb"),
v("vk", "KuruVerb"),
v("vn", "IrregularGodanNuVerb"),
v("vr", "IrregularGodanRuVerbWithPlainRiForm"),
v("vs", "SuruVerb"),
v("vs-c", "SuruPrecursorVerb"),
v("vs-i", "IncludedSuruVerb"),
v("vs-s", "SpecialSuruVerb"),
v("vt", "TransitiveVerb"),
v("vz", "IchidanZuruVerb"),
],
}));
content.push_str(&process(Enum {
name: "ReadingInfo",
all_name: None,
doc: "Information regarding a certain ReadingElement.".into(),
entities: Some(&entities["re_inf"]),
variants: vec![
v("gikun", "GikunOrJukujikun"),
v("ik", "IrregularKanaUsage"),
v("ok", "OutdatedKanaUsage"),
v("uK", "UsuallyWrittenUsingKanjiAlone"),
],
}));
content.push_str(&process(Enum {
name: "SenseInfo",
all_name: None,
doc: "Information regarding a certain Sense.".into(),
entities: Some(&entities["misc"]),
variants: vec![
v("X", "XRated"),
v("abbr", "Abbreviation"),
v("arch", "Archaism"),
v("char", "Character"),
v("chn", "ChildrensLanguage"),
v("col", "Colloquialism"),
v("company", "CompanyName"),
v("creat", "Creature"),
v("dated", "DatedTerm"),
v("dei", "Deity"),
v("derog", "Derogatory"),
v("ev", "Event"),
v("fam", "FamiliarLanguage"),
v("fem", "FemaleTermOrLanguage"),
v("fict", "Fiction"),
v("given", "GivenName"),
v("hist", "HistoricalTerm"),
v("hon", "HonorificLanguage"),
v("hum", "HumbleLanguage"),
v("id", "IdiomaticExpression"),
v("joc", "JocularTerm"),
v("leg", "Legend"),
v("litf", "LiteraryOrFormalTerm"),
v("m-sl", "MangaSlang"),
v("male", "MaleTermOrLanguage"),
v("myth", "Mythology"),
v("net-sl", "InternetSlang"),
v("obj", "Object"),
v("obs", "ObsoleteTerm"),
v("obsc", "ObscureTerm"),
v("on-mim", "Onomatopoeia"),
v("organization", "OrganizationName"),
v("oth", "Other"),
v("person", "PersonName"),
v("place", "PlaceName"),
v("poet", "PoeticalTerm"),
v("pol", "PoliteLanguage"),
v("product", "ProductName"),
v("proverb", "Proverb"),
v("quote", "Quotation"),
v("rare", "Rare"),
v("relig", "Religion"),
v("sens", "Sensitive"),
v("serv", "Service"),
v("sl", "Slang"),
v("station", "RailwayStation"),
v("surname", "Surname"),
v("uk", "UsuallyWrittenUsingKanaAlone"),
v("unclass", "UnclassifiedName"),
v("vulg", "VulgarTerm"),
v("work", "WorkOfArt"),
v("yoji", "Yojijukugo"),
],
}));
content.push_str(&process(Enum {
name: "SenseTopic",
all_name: None,
doc: "Field of study where a certain Sense originates.".into(),
entities: Some(&entities["field"]),
variants: vec![
v("Buddh", "Buddhism"),
v("Christn", "Christianity"),
v("MA", "MartialArts"),
v("Shinto", "Shinto"),
v("agric", "Agriculture"),
v("anat", "Anatomy"),
v("archeol", "Archeology"),
v("archit", "Architecture"),
v("art", "Art"),
v("astron", "Astronomy"),
v("audvid", "AudioVisual"),
v("aviat", "Aviation"),
v("baseb", "Baseball"),
v("biochem", "Biochemistry"),
v("biol", "Biology"),
v("bot", "Botany"),
v("bus", "Business"),
v("chem", "Chemistry"),
v("comp", "Computing"),
v("cryst", "Crystallography"),
v("ecol", "Ecology"),
v("econ", "Economics"),
v("elec", "ElectricalEngineering"),
v("electr", "Electronics"),
v("embryo", "Embryology"),
v("engr", "Engineering"),
v("ent", "Entomology"),
v("finc", "Finance"),
v("fish", "Fishing"),
v("food", "Food"),
v("gardn", "Gardening"),
v("genet", "Genetics"),
v("geogr", "Geography"),
v("geol", "Geology"),
v("geom", "Geometry"),
v("go", "Go"),
v("golf", "Golf"),
v("gramm", "Grammar"),
v("grmyth", "GreekMythology"),
v("hanaf", "Hanafuda"),
v("horse", "Horseracing"),
v("law", "Law"),
v("ling", "Linguistics"),
v("logic", "Logic"),
v("mahj", "Mahjong"),
v("math", "Mathematics"),
v("mech", "MechanicalEngineering"),
v("med", "Medicine"),
v("met", "Meteorology"),
v("mil", "Military"),
v("music", "Music"),
v("ornith", "Ornithology"),
v("paleo", "Paleontology"),
v("pathol", "Pathology"),
v("pharm", "Pharmacy"),
v("phil", "Philosophy"),
v("photo", "Photography"),
v("physics", "Physics"),
v("physiol", "Physiology"),
v("print", "Printing"),
v("psych", "Psychology"),
v("shogi", "Shogi"),
v("sports", "Sports"),
v("stat", "Statistics"),
v("sumo", "Sumo"),
v("telec", "Telecommunications"),
v("tradem", "Trademark"),
v("vidg", "VideoGame"),
v("zool", "Zoology"),
],
}));
let out_dir = std::env::var_os("OUT_DIR").unwrap();
let dest_path = std::path::Path::new(&out_dir).join("generated.rs");
std::fs::write(&dest_path, content).unwrap();
}
fn process(e: Enum) -> String {
let mut lines = vec![];
//render the corresponding fully-populated enum, if requested
if let Some(all_name) = e.all_name {
lines.push(process(Enum {
name: all_name,
all_name: None,
doc: format!("{} This enum contains all possible variants, including those that have been disabled by compile-time flags in `enum {}`.", e.doc, e.name),
entities: e.entities,
variants: e.variants.iter().map(|v| EnumVariant{enabled: true, ..*v}).collect(),
}));
}
//enum declaration
lines.push(format!("/// {}", e.doc));
lines.push("#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]".into());
lines.push(format!("pub enum {} {{", e.name));
for v in e.variants.iter().filter(|v| v.enabled) {
if let Some(ref entities) = e.entities {
lines.push(format!(" ///{}", entities[v.code].as_str().unwrap()));
}
lines.push(format!(" {},", v.name));
}
lines.push("}\n".into());
//start impl Enum
lines.push(format!("impl Enum for {} {{", e.name));
//fn code(&self) -> &str
lines.push(" fn code(&self) -> &'static str {".into());
lines.push(" match *self {".into());
for v in e.variants.iter().filter(|v| v.enabled) {
lines.push(format!(
" {}::{} => \"{}\",",
e.name, v.name, v.code
));
}
lines.push(" }".into());
lines.push(" }\n".into());
//fn constant_name(&self) -> &str
lines.push(" fn constant_name(&self) -> &'static str {".into());
lines.push(" match *self {".into());
for v in e.variants.iter().filter(|v| v.enabled) {
lines.push(format!(
" {}::{} => \"{}\",",
e.name, v.name, v.name
));
}
lines.push(" }".into());
lines.push(" }\n".into());
//fn from_code(&str) -> Self
lines.push(" fn from_code(text: &str) -> Option<Self> {".into());
lines.push(" match text {".into());
for v in e.variants.iter().filter(|v| v.enabled) {
lines.push(format!(
" \"{}\" => Some({}::{}),",
v.code, e.name, v.name
));
}
lines.push(" _ => None,".into());
lines.push(" }".into());
lines.push(" }\n".into());
//end impl Enum
lines.push("}\n".into());
//impl Display
lines.push(format!("impl std::fmt::Display for {} {{", e.name));
lines.push(" fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {".into());
lines.push(" write!(f, \"{}\", self.constant_name())".into());
lines.push(" }".into());
lines.push("}\n".into());
//impl EnumPayload
lines.push(format!("impl EnumPayload for {} {{", e.name));
lines.push(" fn to_u32(&self) -> u32 {".into());
lines.push(" match *self {".into());
for (idx, v) in e.variants.iter().filter(|v| v.enabled).enumerate() {
lines.push(format!(" {}::{} => {},", e.name, v.name, idx));
}
lines.push(" }".into());
lines.push(" }\n".into());
lines.push(" fn from_u32(code: u32) -> Self {".into());
lines.push(" match code {".into());
for (idx, v) in e.variants.iter().filter(|v| v.enabled).enumerate() {
lines.push(format!(" {} => {}::{},", idx, e.name, v.name));
}
lines.push(format!(
" _ => panic!(\"unknown {} code: {{}}\", code),",
e.name
));
lines.push(" }".into());
lines.push(" }".into());
lines.push("}\n".into());
if let Some(all_name) = e.all_name {
//impl TryFrom
lines.push(format!(
"impl std::convert::TryFrom<{}> for {} {{",
all_name, e.name
));
lines.push(" type Error = DisabledVariant;".into());
lines.push(format!(
" fn try_from(value: {}) -> Result<{}, DisabledVariant> {{",
all_name, e.name,
));
lines.push(" match value {".into());
for v in e.variants.iter() {
if v.enabled {
lines.push(format!(
" {}::{} => Ok({}::{}),",
all_name, v.name, e.name, v.name
));
} else {
lines.push(format!(
" {}::{} => Err(DisabledVariant),",
all_name, v.name
));
}
}
lines.push(" }".into());
lines.push(" }".into());
lines.push("}\n".into());
//impl From
lines.push(format!(
"impl std::convert::From<{}> for {} {{",
e.name, all_name
));
lines.push(format!(" fn from(value: {}) -> {} {{", e.name, all_name));
lines.push(" match value {".into());
for v in e.variants.iter().filter(|v| v.enabled) {
lines.push(format!(
" {}::{} => {}::{},",
e.name, v.name, all_name, v.name
));
}
lines.push(" }".into());
lines.push(" }".into());
lines.push("}\n".into());
}
lines.join("\n")
}

jmdict-enums/data/entities.json
@@ -0,0 +1,247 @@
{
"dial": {
"hob": "Hokkaido-ben",
"ksb": "Kansai-ben",
"ktb": "Kantou-ben",
"kyb": "Kyoto-ben",
"kyu": "Kyuushuu-ben",
"nab": "Nagano-ben",
"osb": "Osaka-ben",
"rkb": "Ryuukyuu-ben",
"thb": "Touhoku-ben",
"tsb": "Tosa-ben",
"tsug": "Tsugaru-ben"
},
"field": {
"Buddh": "Buddhism",
"Christn": "Christianity",
"MA": "martial arts",
"Shinto": "Shinto",
"agric": "agriculture",
"anat": "anatomy",
"archeol": "archeology",
"archit": "architecture, building",
"art": "art, aesthetics",
"astron": "astronomy",
"audvid": "audio-visual",
"aviat": "aviation",
"baseb": "baseball",
"biochem": "biochemistry",
"biol": "biology",
"bot": "botany",
"bus": "business",
"chem": "chemistry",
"comp": "computing",
"cryst": "crystallography",
"ecol": "ecology",
"econ": "economics",
"elec": "electricity, elec. eng.",
"electr": "electronics",
"embryo": "embryology",
"engr": "engineering",
"ent": "entomology",
"finc": "finance",
"fish": "fishing",
"food": "food, cooking",
"gardn": "gardening, horticulture",
"genet": "genetics",
"geogr": "geography",
"geol": "geology",
"geom": "geometry",
"go": "go (game)",
"golf": "golf",
"gramm": "grammar",
"grmyth": "Greek mythology",
"hanaf": "hanafuda",
"horse": "horse-racing",
"law": "law",
"ling": "linguistics",
"logic": "logic",
"mahj": "mahjong",
"math": "mathematics",
"mech": "mechanical engineering",
"med": "medicine",
"met": "climate, weather",
"mil": "military",
"music": "music",
"ornith": "ornithology",
"paleo": "paleontology",
"pathol": "pathology",
"pharm": "pharmacy",
"phil": "philosophy",
"photo": "photography",
"physics": "physics",
"physiol": "physiology",
"print": "printing",
"psych": "psychology, psychiatry",
"shogi": "shogi",
"sports": "sports",
"stat": "statistics",
"sumo": "sumo",
"telec": "telecommunications",
"tradem": "trademark",
"vidg": "video game",
"zool": "zoology"
},
"ke_inf": {
"ateji": "ateji (phonetic) reading",
"iK": "word containing irregular kanji usage",
"ik": "word containing irregular kana usage",
"io": "irregular okurigana usage",
"oK": "word containing out-dated kanji"
},
"misc": {
"X": "rude or X-rated term (not displayed in educational software)",
"abbr": "abbreviation",
"arch": "archaism",
"char": "character",
"chn": "children's language",
"col": "colloquialism",
"company": "company name",
"creat": "creature",
"dated": "dated term",
"dei": "deity",
"derog": "derogatory",
"ev": "event",
"fam": "familiar language",
"fem": "female term or language",
"fict": "fiction",
"given": "given name or forename, gender not specified",
"hist": "historical term",
"hon": "honorific or respectful (sonkeigo) language",
"hum": "humble (kenjougo) language",
"id": "idiomatic expression",
"joc": "jocular, humorous term",
"leg": "legend",
"litf": "literary or formal term",
"m-sl": "manga slang",
"male": "male term or language",
"myth": "mythology",
"net-sl": "Internet slang",
"obj": "object",
"obs": "obsolete term",
"obsc": "obscure term",
"on-mim": "onomatopoeic or mimetic word",
"organization": "organization name",
"oth": "other",
"person": "full name of a particular person",
"place": "place name",
"poet": "poetical term",
"pol": "polite (teineigo) language",
"product": "product name",
"proverb": "proverb",
"quote": "quotation",
"rare": "rare",
"relig": "religion",
"sens": "sensitive",
"serv": "service",
"sl": "slang",
"station": "railway station",
"surname": "family or surname",
"uk": "word usually written using kana alone",
"unclass": "unclassified name",
"vulg": "vulgar expression or word",
"work": "work of art, literature, music, etc. name",
"yoji": "yojijukugo"
},
"pos": {
"adj-f": "noun or verb acting prenominally",
"adj-i": "adjective (keiyoushi)",
"adj-ix": "adjective (keiyoushi) - yoi/ii class",
"adj-kari": "'kari' adjective (archaic)",
"adj-ku": "'ku' adjective (archaic)",
"adj-na": "adjectival nouns or quasi-adjectives (keiyodoshi)",
"adj-nari": "archaic/formal form of na-adjective",
"adj-no": "nouns which may take the genitive case particle 'no'",
"adj-pn": "pre-noun adjectival (rentaishi)",
"adj-shiku": "'shiku' adjective (archaic)",
"adj-t": "'taru' adjective",
"adv": "adverb (fukushi)",
"adv-to": "adverb taking the 'to' particle",
"aux": "auxiliary",
"aux-adj": "auxiliary adjective",
"aux-v": "auxiliary verb",
"conj": "conjunction",
"cop": "copula",
"ctr": "counter",
"exp": "expressions (phrases, clauses, etc.)",
"int": "interjection (kandoushi)",
"n": "noun (common) (futsuumeishi)",
"n-adv": "adverbial noun (fukushitekimeishi)",
"n-pr": "proper noun",
"n-pref": "noun, used as a prefix",
"n-suf": "noun, used as a suffix",
"n-t": "noun (temporal) (jisoumeishi)",
"num": "numeric",
"pn": "pronoun",
"pref": "prefix",
"prt": "particle",
"suf": "suffix",
"unc": "unclassified",
"v-unspec": "verb unspecified",
"v1": "Ichidan verb",
"v1-s": "Ichidan verb - kureru special class",
"v2a-s": "Nidan verb with 'u' ending (archaic)",
"v2b-k": "Nidan verb (upper class) with 'bu' ending (archaic)",
"v2b-s": "Nidan verb (lower class) with 'bu' ending (archaic)",
"v2d-k": "Nidan verb (upper class) with 'dzu' ending (archaic)",
"v2d-s": "Nidan verb (lower class) with 'dzu' ending (archaic)",
"v2g-k": "Nidan verb (upper class) with 'gu' ending (archaic)",
"v2g-s": "Nidan verb (lower class) with 'gu' ending (archaic)",
"v2h-k": "Nidan verb (upper class) with 'hu/fu' ending (archaic)",
"v2h-s": "Nidan verb (lower class) with 'hu/fu' ending (archaic)",
"v2k-k": "Nidan verb (upper class) with 'ku' ending (archaic)",
"v2k-s": "Nidan verb (lower class) with 'ku' ending (archaic)",
"v2m-k": "Nidan verb (upper class) with 'mu' ending (archaic)",
"v2m-s": "Nidan verb (lower class) with 'mu' ending (archaic)",
"v2n-s": "Nidan verb (lower class) with 'nu' ending (archaic)",
"v2r-k": "Nidan verb (upper class) with 'ru' ending (archaic)",
"v2r-s": "Nidan verb (lower class) with 'ru' ending (archaic)",
"v2s-s": "Nidan verb (lower class) with 'su' ending (archaic)",
"v2t-k": "Nidan verb (upper class) with 'tsu' ending (archaic)",
"v2t-s": "Nidan verb (lower class) with 'tsu' ending (archaic)",
"v2w-s": "Nidan verb (lower class) with 'u' ending and 'we' conjugation (archaic)",
"v2y-k": "Nidan verb (upper class) with 'yu' ending (archaic)",
"v2y-s": "Nidan verb (lower class) with 'yu' ending (archaic)",
"v2z-s": "Nidan verb (lower class) with 'zu' ending (archaic)",
"v4b": "Yodan verb with 'bu' ending (archaic)",
"v4g": "Yodan verb with 'gu' ending (archaic)",
"v4h": "Yodan verb with 'hu/fu' ending (archaic)",
"v4k": "Yodan verb with 'ku' ending (archaic)",
"v4m": "Yodan verb with 'mu' ending (archaic)",
"v4n": "Yodan verb with 'nu' ending (archaic)",
"v4r": "Yodan verb with 'ru' ending (archaic)",
"v4s": "Yodan verb with 'su' ending (archaic)",
"v4t": "Yodan verb with 'tsu' ending (archaic)",
"v5aru": "Godan verb - -aru special class",
"v5b": "Godan verb with 'bu' ending",
"v5g": "Godan verb with 'gu' ending",
"v5k": "Godan verb with 'ku' ending",
"v5k-s": "Godan verb - Iku/Yuku special class",
"v5m": "Godan verb with 'mu' ending",
"v5n": "Godan verb with 'nu' ending",
"v5r": "Godan verb with 'ru' ending",
"v5r-i": "Godan verb with 'ru' ending (irregular verb)",
"v5s": "Godan verb with 'su' ending",
"v5t": "Godan verb with 'tsu' ending",
"v5u": "Godan verb with 'u' ending",
"v5u-s": "Godan verb with 'u' ending (special class)",
"v5uru": "Godan verb - Uru old class verb (old form of Eru)",
"vi": "intransitive verb",
"vk": "Kuru verb - special class",
"vn": "irregular nu verb",
"vr": "irregular ru verb, plain form ends with -ri",
"vs": "noun or participle which takes the aux. verb suru",
"vs-c": "su verb - precursor to the modern suru",
"vs-i": "suru verb - included",
"vs-s": "suru verb - special class",
"vt": "transitive verb",
"vz": "Ichidan verb - zuru verb (alternative form of -jiru verbs)"
},
"re_inf": {
"gikun": "gikun (meaning as reading) or jukujikun (special kanji reading)",
"ik": "word containing irregular kana usage",
"ok": "out-dated or obsolete kana usage",
"uK": "word usually written using kanji alone"
}
}

@ -0,0 +1,177 @@
/*******************************************************************************
* Copyright 2021 Stefan Majewsky <majewsky@gmx.net>
* SPDX-License-Identifier: Apache-2.0
* Refer to the file "LICENSE" for details.
*******************************************************************************/
//! Autogenerated enums for the `jmdict` crate.
//!
//! This code is in a separate crate because, if we put it in the `jmdict` crate itself, its
//! `build.rs` could not import it.
//!
//! # Compatibility promise
//!
//! **There is none.** This crate can disappear at any time if we choose to restructure the build
//! system for the `jmdict` crate. To use the types from this crate, look at the re-exports of the
//! same name in [the `jmdict` crate](https://docs.rs/jmdict/).
///Error type for all enum conversions of the form `impl TryFrom<AllFoo> for Foo`.
///
///The error is returned for variants from the full enum that are disabled in the main enum because
///of the compile-time configuration. For example:
///
///```
///# use jmdict_enums::*;
///use std::convert::TryInto;
///let val: Result<PartOfSpeech, _> = AllPartOfSpeech::NariAdjective.try_into();
///#[cfg(feature = "scope-archaic")]
///assert_eq!(val, Ok(PartOfSpeech::NariAdjective));
///#[cfg(not(feature = "scope-archaic"))]
///assert_eq!(val, Err(DisabledVariant));
///```
#[derive(Clone, Copy, Default, Hash, PartialEq, Eq, Debug)]
pub struct DisabledVariant;
///Internal helper functions for serialization and deserialization of enum values.
///
///This is an internal trait; it is not re-exported by the `jmdict` crate and thus not part of the
///public API.
pub trait EnumPayload {
fn to_u32(&self) -> u32;
fn from_u32(code: u32) -> Self;
}
///Common methods provided by all enums in this crate.
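///
///A small usage sketch (assuming the generated `Dialect` enum, whose variant for the
///JMdict entity "ksb" is named `Kansai`):
///
///```
///# use jmdict_enums::{Dialect, Enum};
///assert_eq!(Dialect::Kansai.code(), "ksb");
///assert_eq!(Dialect::from_code("ksb"), Some(Dialect::Kansai));
///assert_eq!(Dialect::Kansai.constant_name(), "Kansai");
///```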
pub trait Enum: Sized {
///Returns the string that marks this enum variant in the JMdict. For values that JMdict
///represents as XML entities, only the entity name is returned, e.g. `adj-n` instead of
///`&adj-n;`.
fn code(&self) -> &'static str;
///Parses a representation from the JMdict file into a value of this enum. This is the reverse
///of `self.code()`, i.e. `Self::from_code(self.code()) == Some(self)`.
fn from_code(code: &str) -> Option<Self>;
///Returns the variant name. This is used to generate Rust code for this enum. The `impl
///Display` for enums uses this same representation.
fn constant_name(&self) -> &'static str;
}
///PriorityInCorpus appears in struct [Priority]. It describes how often a dictionary entry
///appears in a certain corpus of text.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum PriorityInCorpus {
///The vocabulary appears often within the given corpus.
Primary,
///The vocabulary appears within the given corpus, but not particularly often.
Secondary,
///The vocabulary does not appear in the given corpus. This is the `Default::default()` value.
Absent,
}
impl Default for PriorityInCorpus {
fn default() -> Self {
Self::Absent
}
}
impl PriorityInCorpus {
fn to_repr(&self) -> u32 {
match *self {
Self::Absent => 0,
Self::Primary => 1,
Self::Secondary => 2,
}
}
fn from_repr(code: u32) -> Self {
match code {
0 => Self::Absent,
1 => Self::Primary,
2 => Self::Secondary,
_ => panic!("invalid PriorityInCorpus code: {}", code),
}
}
}
///Relative priority of a ReadingElement or KanjiElement.
///
///The various fields indicate if the vocabulary appears in various references, which can be taken
///as an indication of the frequency with which it is used.
///
///For the sake of encoding efficiency, this struct is not a perfect representation of the data in
///the JMdict. Some entries in the JMdict are marked with contradictory priority information. In
///this case, `Priority` will only contain the values corresponding to the highest priority. For
///example, a priority of `ichi1,ichi2,news1,nf09` is represented as:
///
///```
///# use jmdict_enums::{PriorityInCorpus::*, Priority};
///let p = Priority {
/// news: Primary,
/// ichimango: Primary, //"ichi2" gets ignored
/// loanwords: Absent,
/// additional: Absent,
/// frequency_bucket: 9,
///};
///```
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)]
pub struct Priority {
///If not `Absent`, this vocabulary appears in the wordfreq file compiled by Alexandre Girardi
///from the Mainichi Shimbun. (A copy of the file can be obtained from the EDRDG.)
pub news: PriorityInCorpus,
///If not `Absent`, this vocabulary appears in the book "1万語語彙分類集" (Ichimango goi
///bunruishuu) by Senmon Kyouiku Publishing, Tokyo, 1998. The entries with priority `Secondary`
///were demoted from `Primary` because they were observed to have low frequencies in the WWW
///and newspapers.
pub ichimango: PriorityInCorpus,
///If not `Absent`, this vocabulary is a common loanword that appears in the wordfreq file.
pub loanwords: PriorityInCorpus,
///This covers a small number of words when they are detected as being common, but are not
///included in the above corpora.
pub additional: PriorityInCorpus,
///If `self.news != Absent`, this field contains a value between 1 and 48, indicating the
///frequency-of-use ranking for this vocabulary in the wordfreq file. The value 1 is used for
///the 500 most common words, the value 2 is used for the 500 next most common words, and so
///on. If `self.news == Absent`, this value will be 0.
pub frequency_bucket: u16,
}
impl Priority {
///Indicates whether this is a common vocabulary. This follows the same logic as the `(P)`
///markers in the EDICT and EDICT2 files: A word is common if any of its `PriorityInCorpus`
///fields is `Primary`, or if `self.additional == Secondary`.
pub fn is_common(&self) -> bool {
use PriorityInCorpus::*;
self.news == Primary
|| self.ichimango == Primary
|| self.loanwords == Primary
|| self.additional != Absent
}
}
//Priority gets serialized into u32, same as the enum types. The lower 16 bits are used for the
//frequency buckets. The higher 16 bits are evenly distributed among the four PriorityInCorpus
//fields. The encoding could be denser if we wanted to, but u32 is the smallest encoding unit
//available to us anyway, so we don't need to bother.
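//
//For example, the `ichi1,news1,nf09` priority from the doc comment above encodes as
//0x00110009: frequency bucket 9 in the low 16 bits, news = Primary (repr 1) at bit 16,
//and ichimango = Primary (repr 1) at bit 20.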
impl EnumPayload for Priority {
fn to_u32(&self) -> u32 {
let mut result = self.frequency_bucket as u32;
result |= self.news.to_repr() << 16;
result |= self.ichimango.to_repr() << 20;
result |= self.loanwords.to_repr() << 24;
result |= self.additional.to_repr() << 28;
result
}
fn from_u32(code: u32) -> Self {
Self {
news: PriorityInCorpus::from_repr((code & 0xF0000) >> 16),
ichimango: PriorityInCorpus::from_repr((code & 0xF00000) >> 20),
loanwords: PriorityInCorpus::from_repr((code & 0xF000000) >> 24),
additional: PriorityInCorpus::from_repr((code & 0xF0000000) >> 28),
frequency_bucket: (code & 0xFFFF) as u16,
}
}
}
include!(concat!(env!("OUT_DIR"), "/generated.rs"));

@ -0,0 +1,17 @@
[package]
name = "jmdict-traverse"
version = "0.1.0"
authors = ["Stefan Majewsky <majewsky@gmx.net>"]
edition = "2018"
description = "Build system support for the jmdict crate. Do not import directly."
readme = "README.md"
homepage = "https://github.com/majewsky/rust-jmdict/tree/main/jmdict-traverse"
license = "Apache-2.0"
[dependencies]
jmdict-enums = { path = "../jmdict-enums", version = "0.1.0" }
directories = "^3"
hex-literal = "^0.3"
libflate = "^1"
sha2 = "^0.9"
json = "^0.12"

@ -0,0 +1,12 @@
# jmdict-traverse
Parsing utilities for the build and test phases of the `jmdict` crate.
This code is in a separate crate because, if we put it in the `jmdict` crate itself, its `build.rs`
could not import it.
## Compatibility promise
**There is none.** Although this crate is published on crates.io for technical reasons, it is
internal to the `jmdict` crate. Its API may change at any time, including in bugfix releases. Use
the [API provided by the `jmdict` crate](https://docs.rs/jmdict/) instead.

@ -0,0 +1,99 @@
/*******************************************************************************
* Copyright 2021 Stefan Majewsky <majewsky@gmx.net>
* SPDX-License-Identifier: Apache-2.0
* Refer to the file "LICENSE" for details.
*******************************************************************************/
use hex_literal::hex;
use std::path::PathBuf;
const ENTRYPACK_URL: &str = "https://dl.xyrillian.de/jmdict/entrypack-v1-2021-04-13.json.gz";
const ENTRYPACK_SHA256SUM: [u8; 32] =
hex!("9b92671745758a07528a7b9a057c9d4726709b539707786bd281179fd3a4eac3");
pub struct EntryPack {
pub path: PathBuf,
pub sha256sum: Option<&'static [u8; 32]>,
}
impl EntryPack {
pub fn locate_or_download() -> Self {
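        //Usage sketch: `RUST_JMDICT_ENTRYPACK=default` forces a download into the user's
        //cache directory; `RUST_JMDICT_ENTRYPACK=/path/to/entrypack.json` reads that file
        //instead; when unset, `data/entrypack.json` is preferred if it exists.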
match std::env::var_os("RUST_JMDICT_ENTRYPACK") {
//download from hard-coded source if explicitly requested
Some(s) if s == "default" => Self {
path: download_to_cache(ENTRYPACK_URL),
sha256sum: Some(&ENTRYPACK_SHA256SUM),
},
//use override path if explicitly given
Some(path_str) => Self {
path: path_str.into(),
sha256sum: None,
},
//default behavior: use file from repository for development builds, otherwise download
//from hard-coded source
None => {
let local_path = std::path::Path::new("data/entrypack.json");
if local_path.exists() {
Self {
path: local_path.into(),
sha256sum: None,
}
} else {
Self {
path: download_to_cache(ENTRYPACK_URL),
sha256sum: Some(&ENTRYPACK_SHA256SUM),
}
}
}
}
}
pub fn contents(&self) -> String {
use libflate::gzip::Decoder;
use sha2::{Digest, Sha256};
use std::io::Read;
let data = std::fs::read(&self.path).unwrap();
if let Some(expected_hash) = self.sha256sum {
let hash = Sha256::digest(&data[..]);
assert_eq!(&hash[..], expected_hash);
}
//check for GZip magic number
if data[0] == 31 && data[1] == 139 {
let mut decoder = Decoder::new(&data[..]).unwrap();
let mut result = String::with_capacity(100 << 20);
decoder.read_to_string(&mut result).unwrap();
result
} else {
String::from_utf8(data).unwrap()
}
}
}
fn download_to_cache(url: &str) -> PathBuf {
//construct path of the form "$HOME/.cache/rust-jmdict/entrypack-YYYY-MM-DD.json.gz"
let base_dirs = directories::BaseDirs::new().unwrap();
let mut path = PathBuf::new();
path.push(base_dirs.cache_dir());
path.push("rust-jmdict");
std::fs::create_dir_all(&path).unwrap();
let basename = url.rsplit('/').next().unwrap();
path.push(&basename);
//only need to download if not present yet
if !path.exists() {
//download with `curl`
let status = std::process::Command::new("curl")
.arg("--fail")
.arg("--silent")
.arg("--output")
.arg(path.as_os_str())
.arg(url)
.status()
.expect("failed to execute curl");
assert!(status.success(), "{}", status);
}
path
}

@ -0,0 +1,365 @@
/*******************************************************************************
* Copyright 2021 Stefan Majewsky <majewsky@gmx.net>
* SPDX-License-Identifier: Apache-2.0
* Refer to the file "LICENSE" for details.
*******************************************************************************/
//! Parsing utilities for the build and test phases of the `jmdict` crate.
//!
//! This code is in a separate crate because, if we put it in the `jmdict` crate itself, its
//! `build.rs` could not import it.
//!
//! # Compatibility promise
//!
//! **There is none.** Although this crate is published on crates.io for technical reasons, it is
//! internal to the `jmdict` crate. Its API may change at any time, including in bugfix releases.
//! Use the [API provided by the `jmdict` crate](https://docs.rs/jmdict/) instead.
use jmdict_enums::{
AllGlossLanguage, AllPartOfSpeech, Dialect, Enum, GlossLanguage, GlossType, KanjiInfo,
PartOfSpeech, Priority, PriorityInCorpus, ReadingInfo, SenseInfo, SenseTopic,
};
use json::JsonValue;
use std::convert::TryInto;
mod entrypack;
use entrypack::EntryPack;
pub struct RawEntry<'a> {
pub ent_seq: u32,
pub k_ele: Vec<RawKanjiElement<'a>>,
pub r_ele: Vec<RawReadingElement<'a>>,
pub sense: Vec<RawSense<'a>>,
}
pub struct RawKanjiElement<'a> {
pub keb: &'a str,
pub ke_inf: Vec<KanjiInfo>,
pub ke_pri: Priority,
}
pub struct RawReadingElement<'a> {
pub reb: &'a str,
pub re_nokanji: bool,
pub re_restr: Vec<&'a str>,
pub re_inf: Vec<ReadingInfo>,
pub re_pri: Priority,
}
pub struct RawSense<'a> {
pub stagk: Vec<&'a str>,
pub stagr: Vec<&'a str>,
pub pos: Vec<PartOfSpeech>,
pub xref: Vec<&'a str>,
pub ant: Vec<&'a str>,
pub field: Vec<SenseTopic>,
pub misc: Vec<SenseInfo>,
pub s_inf: Vec<&'a str>,
pub lsource: Vec<RawLSource<'a>>,
pub dial: Vec<Dialect>,
pub gloss: Vec<RawGloss<'a>>,
}
pub struct RawLSource<'a> {
//NOTE: We do not use the GlossLanguage enum for the lang attribute, because doing so would add
//a very long tail of rare loanword source languages to that enum. (Also, we could not restrict
//variants of GlossLanguage to feature flags in the way we currently do.)
pub text: &'a str,
pub lang: &'a str,
pub is_partial: bool,
pub is_wasei: bool,
}
pub struct RawGloss<'a> {
//NOTE: g_gend and pri are not mapped since they do not actually occur in any entries
pub text: &'a str,
pub lang: GlossLanguage,
pub g_type: GlossType,
}
///Strategy for processing a JMdict file.
pub trait Visitor {
fn process_entry(&mut self, entry: &RawEntry);
///This is called once for each file that was read from disk. The build script uses this to
///generate `cargo:rerun-if-changed` directives.
fn notify_data_file_path(&mut self, _path: &str) {}
}
///Options for traversing a JMdict file. This controls which entries the [Visitor] visits, and
///which parts of the entries it sees.
pub struct Options {
pub is_db_minimal: bool,
pub with_uncommon: bool,
pub with_archaic: bool,
}
///Entry point for this file. All other functions are called directly or indirectly from this fn.
pub fn process_dictionary<V: Visitor>(v: &mut V, opts: Options) {
let entrypack = EntryPack::locate_or_download();
v.notify_data_file_path(&entrypack.path.to_string_lossy());
for entry_str in entrypack.contents().split('\n') {
if !entry_str.is_empty() {
let entry_obj = json::parse(entry_str).unwrap();
if let Some(entry_raw) = RawEntry::from_obj(&entry_obj, &opts) {
if opts.is_db_minimal && entry_raw.ent_seq >= 1010000 {
//for db-minimal, only process entries from data/entries-100.json
return;
}
v.process_entry(&entry_raw);
}
}
}
}
trait Object<'a>: Sized {
fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self>;
fn collect(array: &'a JsonValue, opts: &'_ Options) -> Vec<Self> {
assert!(array.is_null() || array.is_array());
array
.members()
.filter_map(|obj| Self::from_obj(obj, opts))
.collect()
}
fn collect_or_none(array: &'a JsonValue, opts: &'_ Options) -> Option<Vec<Self>> {
let vec = Self::collect(array, opts);
if vec.is_empty() {
None
} else {
Some(vec)
}
}
}
impl<'a> Object<'a> for RawEntry<'a> {
fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self> {
Some(Self {
ent_seq: obj["n"].as_u32().unwrap(),
k_ele: RawKanjiElement::collect(&obj["K"], opts),
r_ele: RawReadingElement::collect_or_none(&obj["R"], opts)?,
sense: RawSense::collect_or_none(&obj["S"], opts)?,
})
}
}
impl<'a> Object<'a> for RawKanjiElement<'a> {
fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self> {
if !opts.with_uncommon && obj["p"].is_empty() {
return None;
}
Some(Self {
keb: obj["t"].as_str().unwrap(),
ke_inf: Object::collect(&obj["i"], opts),
ke_pri: parse_prio(Object::collect(&obj["p"], opts)),
})
}
}
impl<'a> Object<'a> for RawReadingElement<'a> {
fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self> {
if !opts.with_uncommon && obj["p"].is_empty() {
return None;
}
Some(Self {
reb: obj["t"].as_str().unwrap(),
re_nokanji: obj["n"].as_bool().unwrap_or(false),
re_restr: Object::collect(&obj["r"], opts),
re_inf: Object::collect(&obj["i"], opts),
re_pri: parse_prio(Object::collect(&obj["p"], opts)),
})
}
}
fn parse_prio(markers: Vec<&str>) -> Priority {
use PriorityInCorpus::*;
let mut result = Priority {
news: Absent,
ichimango: Absent,
loanwords: Absent,
additional: Absent,
frequency_bucket: 0,
};
for marker in markers {
match marker {
"news1" => result.news = merge_cprio(result.news, Primary),
"news2" => result.news = merge_cprio(result.news, Secondary),
"ichi1" => result.ichimango = merge_cprio(result.ichimango, Primary),
"ichi2" => result.ichimango = merge_cprio(result.ichimango, Secondary),
"gai1" => result.loanwords = merge_cprio(result.loanwords, Primary),
"gai2" => result.loanwords = merge_cprio(result.loanwords, Secondary),
"spec1" => result.additional = merge_cprio(result.additional, Primary),
"spec2" => result.additional = merge_cprio(result.additional, Secondary),
_ => match parse_freq_bucket(marker) {
Some(bucket) => {
if result.frequency_bucket == 0 || result.frequency_bucket > bucket {
result.frequency_bucket = bucket;
}
}
None => {
panic!("unknown priority marker: {}", marker);
}
},
};
}
result
}
fn merge_cprio(old: PriorityInCorpus, new: PriorityInCorpus) -> PriorityInCorpus {
use PriorityInCorpus::*;
match (old, new) {
(Absent, _) => new,
(_, Primary) => Primary,
(Primary, _) => Primary,
(Secondary, _) => Secondary,
}
}
///Parses a frequency bucket marker for the news corpus, e.g. "nf18" => Some(18).
fn parse_freq_bucket(marker: &str) -> Option<u16> {
//NOTE: This would be easier with a regex library, but I'm definitely not pulling in an entire
//regex crate for just this one thing.
let mut c = marker.chars();
if c.next()? != 'n' {
return None;
}
if c.next()? != 'f' {
return None;
}
let tens = c.next()?.to_digit(10)? as u16;
let ones = c.next()?.to_digit(10)? as u16;
if c.next().is_some() {
return None;
}
let result = 10 * tens + ones;
//only nf01..nf48 are allowed
if result == 0 || result > 48 {
None
} else {
Some(result)
}
}
impl<'a> Object<'a> for RawSense<'a> {
fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self> {
let misc = Object::collect(&obj["m"], opts);
if !opts.with_archaic && misc.contains(&SenseInfo::Archaism) {
return None;
}
Some(Self {
stagk: Object::collect(&obj["stagk"], opts),
stagr: Object::collect(&obj["stagr"], opts),
pos: Object::collect(&obj["p"], opts),
xref: Object::collect(&obj["xref"], opts),
ant: Object::collect(&obj["ant"], opts),
field: Object::collect(&obj["f"], opts),
misc,
s_inf: Object::collect(&obj["i"], opts),
lsource: Object::collect(&obj["L"], opts),
dial: Object::collect(&obj["dial"], opts),
gloss: Object::collect_or_none(&obj["G"], opts)?,
})
}
}
impl<'a> Object<'a> for RawLSource<'a> {
fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
let is_partial = match obj["type"].as_str().unwrap_or("full") {
"full" => false,
"part" => true,
val => panic!("unknown ls_type: {}", val),
};
let is_wasei = match obj["wasei"].as_str().unwrap_or("n") {
"n" => false,
"y" => true,
val => panic!("unknown ls_wasei: {}", val),
};
Some(Self {
text: obj["t"].as_str().unwrap(),
lang: obj["l"].as_str().unwrap_or("eng"),
is_partial,
is_wasei,
})
}
}
impl<'a> Object<'a> for RawGloss<'a> {
fn from_obj(obj: &'a JsonValue, opts: &'_ Options) -> Option<Self> {
Some(Self {
text: obj["t"].as_str().unwrap(),
lang: GlossLanguage::from_obj(&obj["l"], opts)?,
g_type: optional_enum(&obj["g_type"], "", "GlossType"),
})
}
}
impl<'a> Object<'a> for &'a str {
fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
Some(obj.as_str().unwrap())
}
}
impl<'a> Object<'a> for Dialect {
fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
Some(required_enum(obj, "Dialect"))
}
}
impl<'a> Object<'a> for GlossLanguage {
fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
let lang: AllGlossLanguage = optional_enum(obj, "eng", "AllGlossLanguage");
lang.try_into().ok()
}
}
impl<'a> Object<'a> for KanjiInfo {
fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
Some(required_enum(obj, "KanjiInfo"))
}
}
impl<'a> Object<'a> for PartOfSpeech {
fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
let lang: AllPartOfSpeech = optional_enum(obj, "eng", "AllPartOfSpeech");
lang.try_into().ok()
}
}
impl<'a> Object<'a> for ReadingInfo {
fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
Some(required_enum(obj, "ReadingInfo"))
}
}
impl<'a> Object<'a> for SenseInfo {
fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
Some(required_enum(obj, "SenseInfo"))
}
}
impl<'a> Object<'a> for SenseTopic {
fn from_obj(obj: &'a JsonValue, _opts: &'_ Options) -> Option<Self> {
Some(required_enum(obj, "SenseTopic"))
}
}
fn optional_enum<E: Enum>(obj: &JsonValue, default: &'static str, enum_name: &'static str) -> E {
let code = obj.as_str().unwrap_or(default);
match E::from_code(code) {
Some(val) => val,
None => panic!("unknown {} representation: {}", enum_name, code),
}
}
fn required_enum<E: Enum>(obj: &JsonValue, enum_name: &'static str) -> E {
let code = obj.as_str().unwrap();
match E::from_code(code) {
Some(val) => val,
None => panic!("unknown {} representation: {}", enum_name, code),
}
}

@ -0,0 +1,390 @@
/*******************************************************************************
* Copyright 2021 Stefan Majewsky <majewsky@gmx.net>
* SPDX-License-Identifier: Apache-2.0
* Refer to the file "LICENSE" for details.
*******************************************************************************/
//! The [JMdict file](https://www.edrdg.org/jmdict/j_jmdict.html) is a comprehensive multilingual
//! dictionary of the Japanese language. The original JMdict file, included in this repository (and
//! hence, in releases of this crate) comes as XML. Instead of stuffing the XML in the binary
//! directly, this crate parses the XML at compile-time and generates an optimized representation
//! that is compiled into the binary. The crate's API affords type-safe access to this embedded
//! database.
//!
//! # WARNING: Licensing on database files
//!
//! The database files compiled into the crate are licensed from the Electronic Dictionary Research
//! and Development Group under Creative Commons licenses. Applications linking this crate directly
//! or indirectly must display appropriate copyright notices to users. Please refer to the
//! [EDRDG's license statement](https://www.edrdg.org/edrdg/licence.html) for details.
//!
//! # Basic usage
//!
//! The database is accessed through the [entries() function](entries) which provides an iterator
//! over all database entries compiled into the application. While traversing the database and its
//! entries, you will find that, whenever you expect a list of something, you will get an iterator
//! instead. These iterators provide an abstraction between you as the user of the library, and the
//! physical representation of the database as embedded in the binary.
//!
//! The following example looks up the reading for お母さん in the database:
//!
//! ```
//! let kanji_form = "お母さん";
//!
//! let entry = jmdict::entries().find(|e| {
//! e.kanji_elements().any(|k| k.text == kanji_form)
//! }).unwrap();
//!
//! let reading_form = entry.reading_elements().next().unwrap().text;
//! assert_eq!(reading_form, "おかあさん");
//! ```
//!
//! # Cargo features
//!
//! ### Common configurations
//!
//! * The `default` feature includes the most common words (about 30000 entries) and only their
//! English translations.
//! * The `full` feature includes everything in the JMdict.
//!
//! ### Entry selection
//!
//! * The `scope-uncommon` feature includes uncommon words and glosses.
//! * The `scope-archaic` feature includes glosses with the "archaic" label. If disabled, the
//! [PartOfSpeech] enum will not include variants that are only relevant for archaic vocabulary,
//! such as obsolete conjugation patterns. (The [AllPartOfSpeech] enum always contains all
//! variants.)
//!
//! ### Target languages
//!
//! At least one target language must be selected. Selecting a target language will include all
//! available translations in that language. Entries that do not have any translation in any of the
//! selected languages will be skipped.
//!
//! * `translations-eng`: English (included in `default`)
//! * `translations-dut`: Dutch
//! * `translations-fre`: French
//! * `translations-ger`: German
//! * `translations-hun`: Hungarian
//! * `translations-rus`: Russian
//! * `translations-slv`: Slovenian
//! * `translations-spa`: Spanish
//! * `translations-swe`: Swedish
//!
//! The [GlossLanguage] enum will only contain variants corresponding to the enabled target
//! languages. For example, in the default configuration, `GlossLanguage::English` will be the only
//! variant. (The [AllGlossLanguage] enum always contains all variants.)
//!
//! ### Crippled builds: `db-minimal`
//!
//! When the `db-minimal` feature is enabled, only a severely reduced portion of the JMdict will
//! be parsed (to be exact, only chunks 000, 100 and 999). The resulting database is useless for
//! actual usage, but it allows for quick edit-compile-test cycles while working on this crate's
//! code.
//!
//! ### Crippled builds: `db-empty`
//!
//! When the `db-empty` feature is enabled, downloading and parsing of the JMdict contents is
//! disabled entirely. The crate is compiled as usual, but `entries()` will be an empty list.
//! This is useful for documentation builds such as on `docs.rs`, where `--all-features` is given.
pub use jmdict_enums::{
AllGlossLanguage, AllPartOfSpeech, Dialect, DisabledVariant, Enum, GlossLanguage, GlossType,
KanjiInfo, PartOfSpeech, Priority, PriorityInCorpus, ReadingInfo, SenseInfo, SenseTopic,
};
mod payload;
use payload::*;
#[cfg(test)]
mod test_consistency;
#[cfg(test)]
mod test_feature_matrix;
#[cfg(test)]
mod test_ordering;
///Returns an iterator over all entries in the database.
pub fn entries() -> Entries {
Entries::new()
}
///An entry in the JMdict dictionary.
///
///Each entry has zero or more [kanji elements](KanjiElement), one or more
///[reading elements](ReadingElement) and one or more [senses](Sense). Elements contain the
///Japanese representation of the vocabulary or phrase. Whereas reading elements consist of only
///kana, kanji elements will contain characters from non-kana scripts, most commonly kanji. Senses
///contain the translation of the vocabulary or phrase in other languages, most commonly English.
#[derive(Clone, Copy, Debug)]
pub struct Entry {
///The sequence number for this Entry as it appears in the JMdict. Numbers start around 1000000
///and typically increment in steps of 5 or 10. (It's like BASIC line numbers, if you're old
///enough to understand that reference.) The [Entries] iterator guarantees entries to appear
///ordered by sequence number.
pub number: u32,
kanji_elements_iter: KanjiElements,
reading_elements_iter: ReadingElements,
senses_iter: Senses,
}
impl Entry {
pub fn kanji_elements(&self) -> KanjiElements {
self.kanji_elements_iter
}
pub fn reading_elements(&self) -> ReadingElements {
self.reading_elements_iter
}
pub fn senses(&self) -> Senses {
self.senses_iter
}
}
///A representation of a dictionary entry using kanji or other non-kana scripts.
///
///Each [Entry] may have any number of these (including none). For each kanji element, the entry
///will also have [reading elements](ReadingElement) to indicate how to read this kanji element.
#[derive(Clone, Copy, Debug)]
pub struct KanjiElement {
pub text: &'static str,
pub priority: Priority,
info_iter: KanjiInfos,
}
impl KanjiElement {
pub fn infos(&self) -> KanjiInfos {
self.info_iter
}
}
///A representation of a dictionary entry using only kana.
///
///Each [Entry] will have one or more of these. When an entry has both kanji elements and reading
///elements, the kana usage will be consistent between them, that is: If the kanji element contains
///katakana, there is also a corresponding reading element that contains katakana as well.
#[derive(Clone, Copy, Debug)]
pub struct ReadingElement {
pub text: &'static str,
pub priority: Priority,
info_iter: ReadingInfos,
}
impl ReadingElement {
pub fn infos(&self) -> ReadingInfos {
self.info_iter
}
}
///The translational equivalent of a Japanese word or phrase.
///
///Where there are several distinctly different meanings of the word, its [Entry] will have
///multiple senses. Each particular translation is a [Gloss], of which there may be multiple within
///a single sense.
///
///For instance, the entry for 折角 contains one sense with the glosses "with trouble" and "at
///great pains". Those glosses all represent the same meaning, so they appear in one sense. There
///is also a sense with the glosses "rare", "precious", "valuable" and "long-awaited". Those
///glosses represent a different meaning from "with trouble" or "at great pains", so they appear in
///a separate sense. (And in fact, 折角 has even more senses.)
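///
///A short iteration sketch over this structure (output depends on the compiled-in
///database; the `Display` impl of the generated enums is used for the language):
///
///```
///for sense in jmdict::entries().next().unwrap().senses() {
///    for gloss in sense.glosses() {
///        println!("{}: {}", gloss.language, gloss.text);
///    }
///}
///```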
#[derive(Clone, Copy, Debug)]
pub struct Sense {
stagk_iter: Strings,
stagr_iter: Strings,
pos_iter: PartsOfSpeech,
cross_refs_iter: Strings,
antonyms_iter: Strings,
topics_iter: SenseTopics,
info_iter: SenseInfos,
freetext_info_iter: Strings,
loanword_sources_iter: LoanwordSources,
dialects_iter: Dialects,
glosses_iter: Glosses,
}
impl Sense {
///If not empty, this sense only applies to these [KanjiElements] out of all the
///[KanjiElements] in this [Entry].
pub fn applicable_kanji_elements(&self) -> Strings {
self.stagk_iter
}
///If not empty, this sense only applies to these [ReadingElements] out of all the
///[ReadingElements] in this [Entry].
pub fn applicable_reading_elements(&self) -> Strings {
self.stagr_iter
}
pub fn parts_of_speech(&self) -> PartsOfSpeech {
self.pos_iter
}
///If not empty, contains the text of [KanjiElements] or [ReadingElements] of other [Entries]
///with a similar meaning or sense. In some cases, a [KanjiElement]'s text will be followed by
///a [ReadingElement]'s text and/or a sense number to provide a precise target for the
///cross-reference. Where this happens, a katakana middle dot (`・`, U+30FB) is placed between
///the components of the cross-reference.
///
///TODO: Provide a structured type for these kinds of references.
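///
///Until then, a splitting sketch for such compound references:
///
///```
///let parts: Vec<&str> = "この・1".split('・').collect();
///assert_eq!(parts, vec!["この", "1"]);
///```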
pub fn cross_references(&self) -> Strings {
self.cross_refs_iter
}
///If not empty, contains the text of [KanjiElements] or [ReadingElements] of other [Entries]
///which are antonyms of this sense.
pub fn antonyms(&self) -> Strings {
self.antonyms_iter
}
pub fn topics(&self) -> SenseTopics {
self.topics_iter
}
pub fn infos(&self) -> SenseInfos {
self.info_iter
}
///If not empty, contains additional information about this sense (e.g. level of currency or
///other nuances) that cannot be expressed by the other, more structured fields.
pub fn freetext_infos(&self) -> Strings {
self.freetext_info_iter
}
///If not empty, contains source words in other languages from which this vocabulary has been
///borrowed in this sense.
pub fn loanword_sources(&self) -> LoanwordSources {
self.loanword_sources_iter
}
///If not empty, this [Sense] of the [Entry] only appears in the given [Dialects] of Japanese.
pub fn dialects(&self) -> Dialects {
self.dialects_iter
}
pub fn glosses(&self) -> Glosses {
self.glosses_iter
}
}
///A source word in other language which a particular [Sense] of an [Entry] has been borrowed from.
///
///There may be multiple sources for a single [Sense] when it is not clear from which language a
///word has been borrowed (e.g. "セレナーデ" lists both the French word "sérénade" and the German
///word "Serenade" as loanword sources), or if the vocabulary is a composite word with multiple
///distinct sources (e.g. "サブリュック" is a combination of the English prefix "sub-" and the
///German word "Rucksack").
///
///Within an [Entry], glosses appear in the [Sense].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct LoanwordSource {
pub text: &'static str,
///The [ISO 639-2/B code](https://en.wikipedia.org/wiki/List_of_ISO_639-2_codes) for the
///language from which the word was borrowed, e.g. "ger" for German or "chi" for Chinese.
pub language: &'static str,
///Whether this source applies only to part of the loanword. Note that this flag is not always
///present in the JMdict when it should be.
pub is_partial: bool,
///Whether this loanword is a [Wasei-eigo](https://en.wikipedia.org/wiki/Wasei-eigo).
pub is_wasei: bool,
}
///A particular translation or explanation for a Japanese word or phrase in a different language.
///
///Within an [Entry], glosses appear in the [Sense].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Gloss {
pub language: GlossLanguage,
pub text: &'static str,
pub gloss_type: GlossType,
}
///We cannot do `pub type KanjiElements = Range<KanjiElement, N>` etc. because Range<T, N> is
///private to the crate, so instead we declare a bunch of iterator types that wrap Range<T, N>.
macro_rules! wrap_iterator {
($val: ty, $size: literal, $iter: ident) => {
///An iterator providing fast access to objects in the database. Instances of this iterator
///can be copied cheaply.
#[derive(Clone, Copy, Debug)]
pub struct $iter(Range<$val, $size>);
impl From<Range<$val, $size>> for $iter {
fn from(r: Range<$val, $size>) -> $iter {
$iter(r)
}
}
impl std::iter::Iterator for $iter {
type Item = $val;
fn next(&mut self) -> Option<Self::Item> {
self.0.next()
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.0.size_hint()
}
}
impl std::iter::ExactSizeIterator for $iter {
fn len(&self) -> usize {
self.0.len()
}
}
};
}
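//The second argument to wrap_iterator! is N, the number of u32 words that one item of the
//given type occupies in ALL_DATA; it must match the type's FromPayload<N> impl in payload.rs.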
wrap_iterator!(KanjiElement, 5, KanjiElements);
wrap_iterator!(KanjiInfo, 1, KanjiInfos);
wrap_iterator!(ReadingElement, 5, ReadingElements);
wrap_iterator!(ReadingInfo, 1, ReadingInfos);
wrap_iterator!(Sense, 5, Senses);
wrap_iterator!(&'static str, 2, Strings);
wrap_iterator!(PartOfSpeech, 1, PartsOfSpeech);
wrap_iterator!(SenseTopic, 1, SenseTopics);
wrap_iterator!(SenseInfo, 1, SenseInfos);
wrap_iterator!(LoanwordSource, 5, LoanwordSources);
wrap_iterator!(Dialect, 1, Dialects);
wrap_iterator!(Gloss, 3, Glosses);
///An iterator providing fast access to objects in the database. Instances of this iterator
///can be copied cheaply.
#[derive(Clone, Copy)]
pub struct Entries {
//This iterator is very similar to Range<T, N>, but cannot be implemented in terms of it
//because it iterates over ALL_ENTRY_OFFSETS instead of ALL_DATA.
start: usize,
end: usize,
}
impl Entries {
fn new() -> Self {
Self {
start: 0,
end: entry_count(),
}
}
}
impl std::iter::Iterator for Entries {
type Item = Entry;
fn next(&mut self) -> Option<Self::Item> {
if self.start < self.end {
let entry = get_entry(self.start);
self.start += 1;
Some(entry)
} else {
None
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
let count = self.end - self.start;
(count, Some(count))
}
}
impl std::iter::ExactSizeIterator for Entries {
fn len(&self) -> usize {
self.end - self.start
}
}

@ -0,0 +1,232 @@
/*******************************************************************************
* Copyright 2021 Stefan Majewsky <majewsky@gmx.net>
* SPDX-License-Identifier: Apache-2.0
* Refer to the file "LICENSE" for details.
*******************************************************************************/
//! This file contains the type definitions for the database payload. Because we want the payload
//! format to be an implementation detail, the entire module is private and hence these types are
//! not part of the public API.
use crate::*;
use std::convert::TryInto;
use std::marker::PhantomData;
////////////////////////////////////////////////////////////////////////////////
// generic machinery for iterating over ALL_DATA
pub(crate) trait FromPayload<const N: usize> {
///Given `&ALL_DATA[offset..(offset + N)]`, unmarshals those `N` u32 values into a value of
///`Self`.
fn get(data: &[u32; N]) -> Self;
}
#[derive(Clone, Copy, Debug)]
pub(crate) struct Range<T: FromPayload<N>, const N: usize> {
pub start: usize,
pub end: usize,
pub phantom: PhantomData<T>,
}
impl<T: FromPayload<N>, const N: usize> Range<T, N> {
pub(crate) fn new(start: u32, end: u32) -> Self {
Self {
start: start.try_into().unwrap(),
end: end.try_into().unwrap(),
phantom: PhantomData,
}
}
}
impl<T: FromPayload<N>, const N: usize> std::iter::Iterator for Range<T, N> {
type Item = T;
fn next(&mut self) -> Option<Self::Item> {
if self.start < self.end {
let data = &as_u32_slice(ALL_DATA)[self.start..(self.start + N)];
let item = T::get(data.try_into().unwrap());
self.start += N;
Some(item)
} else {
None
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
let count = (self.end - self.start) / N;
(count, Some(count))
}
}
impl<T: FromPayload<N>, const N: usize> std::iter::ExactSizeIterator for Range<T, N> {
fn len(&self) -> usize {
(self.end - self.start) / N
}
}
////////////////////////////////////////////////////////////////////////////////
// concrete types
pub(crate) fn entry_count() -> usize {
as_u32_slice(ALL_ENTRY_OFFSETS).len()
}
pub(crate) fn get_entry(idx: usize) -> Entry {
let offset: usize = as_u32_slice(ALL_ENTRY_OFFSETS)[idx].try_into().unwrap();
let data = &as_u32_slice(ALL_DATA)[offset..(offset + 4)];
let (start, end) = (data[0], data[1]);
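    //data[2] packs two 16-bit lengths relative to `start`: the low half delimits the kanji
    //elements, the high half the reading elements; senses occupy the rest up to `end`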
let mid1 = start + (data[2] & 0x0000FFFF);
let mid2 = start + ((data[2] & 0xFFFF0000) >> 16);
Entry {
number: data[3],
kanji_elements_iter: Range::new(start, mid1).into(),
reading_elements_iter: Range::new(mid1, mid2).into(),
senses_iter: Range::new(mid2, end).into(),
}
}
impl FromPayload<5> for KanjiElement {
fn get(data: &[u32; 5]) -> Self {
Self {
priority: jmdict_enums::EnumPayload::from_u32(data[0]),
text: get_str(data[1], data[2]),
info_iter: Range::new(data[3], data[4]).into(),
}
}
}
impl FromPayload<1> for KanjiInfo {
fn get(data: &[u32; 1]) -> Self {
jmdict_enums::EnumPayload::from_u32(data[0])
}
}
impl FromPayload<5> for ReadingElement {
fn get(data: &[u32; 5]) -> Self {
Self {
priority: jmdict_enums::EnumPayload::from_u32(data[0]),
text: get_str(data[1], data[2]),
info_iter: Range::new(data[3], data[4]).into(),
}
}
}
impl FromPayload<1> for ReadingInfo {
fn get(data: &[u32; 1]) -> Self {
jmdict_enums::EnumPayload::from_u32(data[0])
}
}
impl FromPayload<5> for Sense {
fn get(data: &[u32; 5]) -> Self {
let (start, end) = (data[0], data[1]);
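        //data[2..5] pack ten 8-bit sub-range boundaries, each relative to `start` and in
        //the same order as the struct fields below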
let mid1 = start + (data[2] & 0x000000FF);
let mid2 = start + ((data[2] & 0x0000FF00) >> 8);
let mid3 = start + ((data[2] & 0x00FF0000) >> 16);
let mid4 = start + ((data[2] & 0xFF000000) >> 24);
let mid5 = start + (data[3] & 0x000000FF);
let mid6 = start + ((data[3] & 0x0000FF00) >> 8);
let mid7 = start + ((data[3] & 0x00FF0000) >> 16);
let mid8 = start + ((data[3] & 0xFF000000) >> 24);
let mid9 = start + (data[4] & 0x000000FF);
let mid10 = start + ((data[4] & 0x0000FF00) >> 8);
Self {
stagk_iter: Range::new(start, mid1).into(),
stagr_iter: Range::new(mid1, mid2).into(),
pos_iter: Range::new(mid2, mid3).into(),
cross_refs_iter: Range::new(mid3, mid4).into(),
antonyms_iter: Range::new(mid4, mid5).into(),
topics_iter: Range::new(mid5, mid6).into(),
info_iter: Range::new(mid6, mid7).into(),
freetext_info_iter: Range::new(mid7, mid8).into(),
loanword_sources_iter: Range::new(mid8, mid9).into(),
dialects_iter: Range::new(mid9, mid10).into(),
glosses_iter: Range::new(mid10, end).into(),
}
}
}
impl FromPayload<1> for PartOfSpeech {
fn get(data: &[u32; 1]) -> Self {
jmdict_enums::EnumPayload::from_u32(data[0])
}
}
impl FromPayload<1> for SenseTopic {
fn get(data: &[u32; 1]) -> Self {
jmdict_enums::EnumPayload::from_u32(data[0])
}
}
impl FromPayload<1> for SenseInfo {
fn get(data: &[u32; 1]) -> Self {
jmdict_enums::EnumPayload::from_u32(data[0])
}
}
impl FromPayload<5> for LoanwordSource {
fn get(data: &[u32; 5]) -> Self {
Self {
text: get_str(data[0], data[1]),
language: get_str(data[2], data[3]),
is_partial: (data[4] & 0x1) == 0x1,
is_wasei: (data[4] & 0x2) == 0x2,
}
}
}
impl FromPayload<1> for Dialect {
fn get(data: &[u32; 1]) -> Self {
jmdict_enums::EnumPayload::from_u32(data[0])
}
}
impl FromPayload<3> for Gloss {
fn get(data: &[u32; 3]) -> Self {
let lang_code = data[2] & 0x0000FFFF;
let type_code = (data[2] & 0xFFFF0000) >> 16;
Gloss {
text: get_str(data[0], data[1]),
language: jmdict_enums::EnumPayload::from_u32(lang_code),
gloss_type: jmdict_enums::EnumPayload::from_u32(type_code),
}
}
}
impl FromPayload<2> for &'static str {
fn get(data: &[u32; 2]) -> Self {
get_str(data[0], data[1])
}
}
fn get_str(start: u32, end: u32) -> &'static str {
let start = start.try_into().unwrap();
let end = end.try_into().unwrap();
&ALL_TEXTS[start..end]
}
////////////////////////////////////////////////////////////////////////////////
// embedded data
//NOTE: We would only need 4-byte alignment, but 16-byte is the smallest alignment interval that
//the align_data crate offers.
//
//NOTE 2: as_u32_slice() cannot be made const because from_raw_parts() is not const, so we have to
//use it on every read access to the respective arrays.
use align_data::{include_aligned, Align16};
fn as_u32_slice(input: &'static [u8]) -> &'static [u32] {
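    //SAFETY: the embedded slices below are 16-byte aligned (see NOTE above), and the
    //element count is truncated to whole u32 values by the integer division.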
unsafe {
let ptr = input.as_ptr() as *const u32;
std::slice::from_raw_parts(ptr, input.len() / 4)
}
}
static ALL_ENTRY_OFFSETS: &[u8] =
include_aligned!(Align16, concat!(env!("OUT_DIR"), "/entry_offsets.dat"));
static ALL_DATA: &[u8] = include_aligned!(Align16, concat!(env!("OUT_DIR"), "/payload.dat"));
static ALL_TEXTS: &str = include_str!(concat!(env!("OUT_DIR"), "/strings.txt"));

@ -0,0 +1,115 @@
/*******************************************************************************
* Copyright 2021 Stefan Majewsky <majewsky@gmx.net>
* SPDX-License-Identifier: Apache-2.0
* Refer to the file "LICENSE" for details.
*******************************************************************************/
use std::fmt::Debug;
#[test]
fn check_consistency() {
//This test runs through the data files in the repository a second time and checks that
//entries() contains exactly what we want. This test especially verifies that all indexes into
//omniarrays are within bounds and point to the right stuff.
struct Visitor(crate::Entries);
impl jmdict_traverse::Visitor for Visitor {
fn process_entry(&mut self, entry: &jmdict_traverse::RawEntry) {
match self.0.next() {
None => panic!("jmdict::entries() exhausted before end of traversal"),
Some(actual) => entry.check(&actual),
};
}
}
let opts = jmdict_traverse::Options {
is_db_minimal: cfg!(feature = "db-minimal"),
with_uncommon: cfg!(feature = "scope-uncommon"),
with_archaic: cfg!(feature = "scope-archaic"),
};
let mut v = Visitor(crate::entries());
jmdict_traverse::process_dictionary(&mut v, opts);
assert!(v.0.next().is_none(), "not all entries were exhausted");
}
trait Check<A> {
fn check(&self, actual: &A);
}
fn check_vec<A, E: Check<A>>(
expected: &Vec<E>,
actual: impl Iterator<Item = A> + ExactSizeIterator,
) {
assert_eq!(expected.len(), actual.len());
for (expected, actual) in expected.iter().zip(actual) {
expected.check(&actual);
}
}
impl<E: Debug + PartialEq<A>, A: Debug + PartialEq<E>> Check<A> for E {
fn check(&self, actual: &A) {
assert_eq!(self, actual);
}
}
impl Check<crate::Entry> for jmdict_traverse::RawEntry<'_> {
fn check(&self, actual: &crate::Entry) {
let expected = self;
check_vec(&expected.k_ele, actual.kanji_elements());
check_vec(&expected.r_ele, actual.reading_elements());
check_vec(&expected.sense, actual.senses());
}
}
impl Check<crate::KanjiElement> for jmdict_traverse::RawKanjiElement<'_> {
fn check(&self, actual: &crate::KanjiElement) {
let expected = self;
assert_eq!(expected.keb, actual.text);
check_vec(&expected.ke_inf, actual.infos());
}
}
impl Check<crate::ReadingElement> for jmdict_traverse::RawReadingElement<'_> {
fn check(&self, actual: &crate::ReadingElement) {
let expected = self;
assert_eq!(expected.reb, actual.text);
check_vec(&expected.re_inf, actual.infos());
}
}
impl Check<crate::Sense> for jmdict_traverse::RawSense<'_> {
fn check(&self, actual: &crate::Sense) {
let expected = self;
check_vec(&expected.stagk, actual.applicable_kanji_elements());
check_vec(&expected.stagr, actual.applicable_reading_elements());
check_vec(&expected.pos, actual.parts_of_speech());
check_vec(&expected.xref, actual.cross_references());
check_vec(&expected.ant, actual.antonyms());
check_vec(&expected.field, actual.topics());
check_vec(&expected.misc, actual.infos());
check_vec(&expected.s_inf, actual.freetext_infos());
check_vec(&expected.lsource, actual.loanword_sources());
check_vec(&expected.dial, actual.dialects());
check_vec(&expected.gloss, actual.glosses());
}
}
impl Check<crate::LoanwordSource> for jmdict_traverse::RawLSource<'_> {
fn check(&self, actual: &crate::LoanwordSource) {
let expected = self;
assert_eq!(expected.lang, actual.language);
assert_eq!(expected.text, actual.text);
assert_eq!(expected.is_partial, actual.is_partial);
assert_eq!(expected.is_wasei, actual.is_wasei);
}
}
impl Check<crate::Gloss> for jmdict_traverse::RawGloss<'_> {
fn check(&self, actual: &crate::Gloss) {
let expected = self;
assert_eq!(expected.lang, actual.language);
assert_eq!(expected.text, actual.text);
assert_eq!(expected.g_type, actual.gloss_type);
}
}

@ -0,0 +1,292 @@
/*******************************************************************************
* Copyright 2021 Stefan Majewsky <majewsky@gmx.net>
* SPDX-License-Identifier: Apache-2.0
* Refer to the file "LICENSE" for details.
*******************************************************************************/
use crate::*;
//NOTE: Choose test words such that tests work with the `db-minimal` feature.
//We want the CI run to complete before we retire.
///Checks that glosses for the selected target languages are available.
#[test]
fn test_gloss_availability() {
let entry = entries()
.find(|e| e.kanji_elements().any(|k| k.text == "お母さん"))
.unwrap();
//while we're at it, test the decoding of entry numbers
assert_eq!(entry.number, 1002650);
let test_cases = &[
("eng", cfg!(feature = "translations-eng"), "mom"),
(
"dut",
cfg!(feature = "translations-dut"),
"moeder {honorifieke term}",
),
("fre", cfg!(feature = "translations-fre"), "mère"),
("ger", cfg!(feature = "translations-ger"), "Mama"),
("hun", cfg!(feature = "translations-hun"), "anya-"),
("rus", cfg!(feature = "translations-rus"), "мама, мамочка"),
("slv", cfg!(feature = "translations-slv"), "mati"),
("spa", cfg!(feature = "translations-spa"), "madre"),
("swe", cfg!(feature = "translations-swe"), "mamma"),
];
for (lang_code, selected, gloss) in test_cases {
let glosses: Vec<_> = entry
.senses()
.flat_map(|s| s.glosses())
.filter(|g| g.language.code() == *lang_code)
.map(|g| g.text)
.collect();
assert_eq!(
*selected,
!glosses.is_empty(),
"language code was {}",
*lang_code
);
if *selected {
assert!(glosses.contains(gloss), "glosses were {:?}", glosses);
}
}
}
///Spot checks for correct decoding of priorities.
#[test]
fn test_priorities() {
//Tests may be skipped if the test entry is not available, since entry
//availability depends on the selection of target languages.
if let Some((_, ke)) = find_by_keb("お参り") {
assert_eq!(
ke.priority,
Priority {
ichimango: PriorityInCorpus::Primary,
news: PriorityInCorpus::Secondary,
frequency_bucket: 36,
..Default::default()
}
);
}
if let Some((_, _, re)) = find_by_keb_reb("あの方", "あのかた") {
assert_eq!(
re.priority,
Priority {
additional: PriorityInCorpus::Primary,
..Default::default()
}
);
}
//`db-minimal` does not contain any gai1/gai2 vocabs
#[cfg(not(feature = "db-minimal"))]
{
if let Some((_, re)) = find_by_reb("アーク") {
assert_eq!(
re.priority,
Priority {
loanwords: PriorityInCorpus::Primary,
..Default::default()
}
);
}
}
}
///Spot checks for correct decoding of enums.
#[test]
fn test_enums() {
//Tests may be skipped if the test entry is not available, since entry
//availability depends on the selection of target languages.
//check for KanjiInfo
if let Some((_, ke)) = find_by_keb("屹度") {
assert_eq!(enum2str(ke.infos()), "Ateji");
}
//check for ReadingInfo (There are no entries with ReadingInfo in "db-minimal"
//unless we include "scope-uncommon".)
let (keb, reb, expected_infos) = if cfg!(feature = "db-minimal") {
if cfg!(feature = "scope-uncommon") {
("彼処", "あしこ", "OutdatedKanaUsage")
} else {
("", "", "")
}
} else {
("発条", "ばね", "GikunOrJukujikun")
};
if keb != "" {
if let Some((_, _, re)) = find_by_keb_reb(keb, reb) {
assert_eq!(enum2str(re.infos()), expected_infos);
}
}
//All Sense lookups rely on a certain gloss, so we need to feature-gate on the gloss language.
#[cfg(feature = "translations-eng")]
{
//check for PartOfSpeech
let sense = find_sense("あっさり", "easily");
assert_eq!(
enum2str(sense.parts_of_speech()),
"Adverb,AdverbTakingToParticle,SuruVerb"
);
//check for SenseTopic
let sense = find_sense("御田", "oden");
assert_eq!(enum2str(sense.topics()), "Food");
//check for SenseInfo
let sense = find_sense("うんこ", "poop");
assert_eq!(enum2str(sense.infos()), "Colloquialism,ChildrensLanguage");
//check for Dialect
let sense = find_sense("ええ", "good");
assert_eq!(enum2str(sense.dialects()), "Kansai");
//check for GlossType
let gloss_text = "in the time it takes to say \"ah!\"";
let sense = find_sense("あっという間に", gloss_text);
let gloss = sense.glosses().find(|g| g.text == gloss_text).unwrap();
assert_eq!(gloss.gloss_type, GlossType::LiteralTranslation);
}
}
///Spot checks for correct inclusion of various string fields.
#[test]
fn test_strings() {
//All Sense lookups rely on a certain gloss, so we need to feature-gate on the gloss language.
#[cfg(feature = "translations-eng")]
{
//check for stagk
let (sense, expected_stagk) = if cfg!(feature = "db-minimal") {
if cfg!(feature = "scope-uncommon") {
(Some(find_sense("遇う", "to treat")), "遇う")
} else {
(None, "")
}
} else {
(
Some(find_sense("アンド", "AND (boolean operator)")),
"",
)
};
if let Some(sense) = sense {
assert_eq!(strs2str(sense.applicable_kanji_elements()), expected_stagk);
}
//check for stagr
let sense = find_sense("彼処", "genitals");
assert_eq!(
strs2str(sense.applicable_reading_elements()),
"あそこ,あすこ,アソコ"
);
//check for xref
let sense = find_sense("彼の", "the");
assert_eq!(strs2str(sense.cross_references()), "どの,この・1,その・1");
//check for ant (`db-minimal` has absolutely none of those)
#[cfg(not(feature = "db-minimal"))]
{
let sense = find_sense("アンダー", "under");
assert_eq!(strs2str(sense.antonyms()), "オーバー・2");
}
//check for s_inf
let sense = find_sense("如何にも", "indeed");
assert_eq!(
strs2str(sense.freetext_infos()),
"indicating emotive conviction"
);
}
}
///Spot checks for correct encoding of loanword sources.
#[test]
fn test_loanword_sources() {
//All Sense lookups rely on a certain gloss, so we need to feature-gate on the gloss language.
//Also, `db-minimal` has nearly no loanword sources to work with.
#[cfg(all(feature = "translations-eng", not(feature = "db-minimal")))]
{
let sense = find_sense("アイメート", "seeing-eye dog");
assert_eq!(
&sense.loanword_sources().collect::<Vec<_>>(),
&[LoanwordSource {
text: "eye mate",
language: "eng",
is_partial: false,
is_wasei: true,
}]
);
//test with partial loanword sources
#[cfg(feature = "scope-uncommon")]
{
let sense = find_sense("サブザック", "small knapsack");
assert_eq!(
&sense.loanword_sources().collect::<Vec<_>>(),
&[
LoanwordSource {
text: "sub",
language: "eng",
is_partial: true,
is_wasei: true,
},
LoanwordSource {
text: "Sack",
language: "ger",
is_partial: true,
is_wasei: true,
}
]
);
}
}
}
fn enum2str<E: Enum>(vals: impl Iterator<Item = E>) -> String {
strs2str(vals.map(|v| v.constant_name()))
}
fn strs2str<'a>(vals: impl Iterator<Item = &'a str>) -> String {
vals.enumerate()
.map(|(i, v)| if i == 0 { v.into() } else { format!(",{}", v) })
.collect()
}
fn find_by_keb(keb: &'static str) -> Option<(Entry, KanjiElement)> {
let e = entries().find(|e| e.kanji_elements().any(|k| k.text == keb))?;
Some((e, e.kanji_elements().find(|k| k.text == keb).unwrap()))
}
fn find_by_reb(reb: &'static str) -> Option<(Entry, ReadingElement)> {
let e = entries().find(|e| e.reading_elements().any(|r| r.text == reb))?;
Some((e, e.reading_elements().find(|r| r.text == reb).unwrap()))
}
fn find_by_keb_reb(
keb: &'static str,
reb: &'static str,
) -> Option<(Entry, KanjiElement, ReadingElement)> {
let e = entries().find(|e| e.kanji_elements().any(|k| k.text == keb))?;
let ke = e.kanji_elements().find(|k| k.text == keb).unwrap();
let re = e.reading_elements().find(|r| r.text == reb)?;
Some((e, ke, re))
}
fn find_sense(jp_text: &'static str, gloss: &'static str) -> Sense {
entries()
.find(|e| {
(e.kanji_elements().any(|k| k.text == jp_text)
|| e.reading_elements().any(|r| r.text == jp_text))
&& e.senses().any(|s| s.glosses().any(|g| g.text == gloss))
})
.unwrap()
.senses()
.find(|s| s.glosses().any(|g| g.text == gloss))
.unwrap()
}

@ -0,0 +1,16 @@
/*******************************************************************************
* Copyright 2021 Stefan Majewsky <majewsky@gmx.net>
* SPDX-License-Identifier: Apache-2.0
* Refer to the file "LICENSE" for details.
*******************************************************************************/
use crate::entries;
#[test]
fn test_entry_order() {
let mut prev = 0;
for entry in entries() {
assert!(entry.number > prev, "{} comes after {}", entry.number, prev);
prev = entry.number;
}
}