forked from firka/flutter
Binary format for hyphenation patterns
In the current state, hyphenation in all languages than Sanskrit seems to work (case-folding edge cases). Thus, we just disable Sanskrit. Packed tries are implemented, but not the finite state machine (space/speed tradeoff). This commit contains a throw-away test app, which runs on the host. I think I want to replace it with unit tests, but I'm including it in the CL because it's useful during development. Bug: 21562869 Bug: 21826930 Bug: 23317038 Bug: 23317904 Change-Id: I7479a565a4a062fa319651c2c14c0fa18c5ceaea
This commit is contained in:
36
engine/src/flutter/app/Android.mk
Normal file
36
engine/src/flutter/app/Android.mk
Normal file
@@ -0,0 +1,36 @@
|
||||
# Copyright (C) 2015 The Android Open Source Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# see how_to_run.txt for instructions on running these tests
|
||||
|
||||
LOCAL_PATH := $(call my-dir)
|
||||
|
||||
include $(CLEAR_VARS)
|
||||
|
||||
LOCAL_MODULE := hyphtool
|
||||
LOCAL_MODULE_TAGS := optional
|
||||
|
||||
LOCAL_STATIC_LIBRARIES := libminikin_host
|
||||
|
||||
# Shared libraries which are dependencies of minikin; these are not automatically
|
||||
# pulled in by the build system (and thus sadly must be repeated).
|
||||
|
||||
LOCAL_SHARED_LIBRARIES := \
|
||||
liblog \
|
||||
libicuuc-host
|
||||
|
||||
LOCAL_SRC_FILES += \
|
||||
HyphTool.cpp
|
||||
|
||||
include $(BUILD_HOST_EXECUTABLE)
|
||||
62
engine/src/flutter/app/HyphTool.cpp
Normal file
62
engine/src/flutter/app/HyphTool.cpp
Normal file
@@ -0,0 +1,62 @@
|
||||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "utils/Log.h"
|
||||
|
||||
#include <vector>
|
||||
#include <minikin/Hyphenator.h>
|
||||
|
||||
using android::Hyphenator;
|
||||
|
||||
Hyphenator* loadHybFile(const char* fn) {
|
||||
struct stat statbuf;
|
||||
int status = stat(fn, &statbuf);
|
||||
if (status < 0) {
|
||||
fprintf(stderr, "error opening %s\n", fn);
|
||||
return nullptr;
|
||||
}
|
||||
size_t size = statbuf.st_size;
|
||||
FILE* f = fopen(fn, "rb");
|
||||
if (f == NULL) {
|
||||
fprintf(stderr, "error opening %s\n", fn);
|
||||
return nullptr;
|
||||
}
|
||||
uint8_t* buf = new uint8_t[size];
|
||||
size_t read_size = fread(buf, 1, size, f);
|
||||
if (read_size < size) {
|
||||
fprintf(stderr, "error reading %s\n", fn);
|
||||
delete[] buf;
|
||||
return nullptr;
|
||||
}
|
||||
return Hyphenator::loadBinary(buf);
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
Hyphenator* hyph = loadHybFile("/tmp/en.hyb"); // should also be configurable
|
||||
std::vector<uint8_t> result;
|
||||
std::vector<uint16_t> word;
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "usage: hyphtool word\n");
|
||||
return 1;
|
||||
}
|
||||
char* asciiword = argv[1];
|
||||
size_t len = strlen(asciiword);
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
uint32_t c = asciiword[i];
|
||||
if (c == '-') {
|
||||
c = 0x00AD;
|
||||
}
|
||||
// ASCII (or possibly ISO Latin 1), but kinda painful to do utf conversion :(
|
||||
word.push_back(c);
|
||||
}
|
||||
hyph->hyphenate(&result, word.data(), word.size());
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
if (result[i] != 0) {
|
||||
printf("-");
|
||||
}
|
||||
printf("%c", word[i]);
|
||||
}
|
||||
printf("\n");
|
||||
return 0;
|
||||
}
|
||||
135
engine/src/flutter/doc/hyb_file_format.md
Normal file
135
engine/src/flutter/doc/hyb_file_format.md
Normal file
@@ -0,0 +1,135 @@
|
||||
# Hyb (hyphenation pattern binary) file format
|
||||
|
||||
The hyb file format is how hyphenation patterns are stored in the system image.
|
||||
|
||||
Goals include:
|
||||
|
||||
* Concise (system image space is at a premium)
|
||||
* Usable when mmap'ed, so it doesn't take significant physical RAM
|
||||
* Fast to compute
|
||||
* Simple
|
||||
|
||||
It is _not_ intended as an interchange format, so there is no attempt to make the format
|
||||
extensible or facilitate backward and forward compatibility.
|
||||
|
||||
Further, at some point we will probably pack patterns for multiple languages into a single
|
||||
physical file, to reduce number of open mmap'ed files. This document doesn't cover that.
|
||||
|
||||
## Theoretical basis
|
||||
|
||||
At heart, the file contains packed tries with suffix compression, actually quite similar
|
||||
to the implementation in TeX.
|
||||
|
||||
The file contains three sections. The first section represents the "alphabet," including
|
||||
case folding. It is effectively a map from Unicode code point to a small integer.
|
||||
|
||||
The second section contains the trie in packed form. It is an array of 3-tuples, packed
|
||||
into a 32 bit integer. Each (suffix-compressed) trie node has a unique index within this
|
||||
array, and the pattern field in the tuple is the pattern for that node. Further, each edge
|
||||
in the trie has an entry in the array, and the character and link fields in the tuple
|
||||
represent the label and destination of the edge. The packing strategy is as in
|
||||
[Word Hy-phen-a-tion by Com-put-er](http://www.tug.org/docs/liang/liang-thesis.pdf) by
|
||||
Franklin Mark Liang.
|
||||
|
||||
The trie representation is similar but not identical to the "double-array trie".
|
||||
The fundamental operation of lookup of the edge from `s` to `t` with label `c` is
|
||||
to compare `c == character[s + c]`, and if so, `t = link[s + c]`.
|
||||
|
||||
The third section contains the pattern strings. This section is in two parts: first,
|
||||
an array with a 3-tuple for each pattern (length, number of trailing 0's, and offset
|
||||
into the string pool); and second, the string pool. Each pattern is encoded as a byte
|
||||
(packing 2 per byte would be possible but the space savings would not be signficant).
|
||||
|
||||
As much as possible of the file is represented as 32 bit integers, as that is especially
|
||||
efficent to access. All are little-endian (this could be revised if the code ever needs
|
||||
to be ported to big-endian systems).
|
||||
|
||||
## Header
|
||||
|
||||
```
|
||||
uint32_t magic == 0x62ad7968
|
||||
uint32_t version = 0
|
||||
uint32_t alphabet_offset (in bytes)
|
||||
uint32_t trie_offset (in bytes)
|
||||
uint32_t pattern_offset (in bytes)
|
||||
uint32_t file_size (in bytes)
|
||||
```
|
||||
|
||||
Offsets are from the front of the file, and in bytes.
|
||||
|
||||
## Alphabet
|
||||
|
||||
The alphabet table comes in two versions. The first is well suited to dense Unicode
|
||||
ranges and is limited to 256. The second is more general, but lookups will be slower.
|
||||
|
||||
### Alphabet, direct version
|
||||
|
||||
```
|
||||
uint32_t version = 0
|
||||
uint32_t min_codepoint
|
||||
uint32_t max_codepoint (exclusive)
|
||||
uint8_t[] data
|
||||
```
|
||||
|
||||
The size of the data array is max_codepoint - min_codepoint. 0 represents an unmapped
|
||||
character. Note that, in the current implementation, automatic hyphenation is disabled
|
||||
for any word containing an unmapped character.
|
||||
|
||||
In general, pad bytes follow this table, aligning the next table to a 4-byte boundary.
|
||||
|
||||
### Alphabet, general version
|
||||
|
||||
```
|
||||
uint32_t version = 1
|
||||
uint32_t n_entries
|
||||
uint32_t[n_entries] data
|
||||
```
|
||||
|
||||
Each element in the data table is `(codepoint << 11) | value`. Note that this is
|
||||
restricted to 11 bits (2048 possible values). The largest known current value is 483
|
||||
(for Sanskrit).
|
||||
|
||||
The entries are sorted by codepoint, to facilitate binary search. Another reasonable
|
||||
implementation for consumers of the data would be to build a hash table at load time.
|
||||
|
||||
## Trie
|
||||
|
||||
```
|
||||
uint32_t version = 0
|
||||
uint32_t char_mask
|
||||
uint32_t link_shift
|
||||
uint32_t link_mask
|
||||
uint32_t pattern_shift
|
||||
uint32_t n_entries
|
||||
uint32_t[n_entries] data
|
||||
```
|
||||
|
||||
Each element in the data table is `(pattern << pattern_shift) | (link << link_shift) | char`.
|
||||
|
||||
All known pattern tables fit in 32 bits total. If this is exceeded, there is a fairly
|
||||
straightforward tweak, where each node occupies a slot by itself (as opposed to sharing
|
||||
it with edge slots), which would require very minimal changes to the implementation (TODO
|
||||
present in more detail).
|
||||
|
||||
## Pattern
|
||||
|
||||
```
|
||||
uint32_t version = 0
|
||||
uint32_t n_entries
|
||||
uint32_t pattern_offset (in bytes)
|
||||
uint32_t pattern_size (in bytes)
|
||||
uint32_t[n_entries] data
|
||||
uint8_t[] pattern_buf
|
||||
```
|
||||
|
||||
Each element in data table is `(len << 26) | (shift << 20) | offset`, where an offset of 0
|
||||
points to the first byte of pattern_buf.
|
||||
|
||||
Generally pattern_offset is `16 + 4 * n_entries`.
|
||||
|
||||
For example, 'a4m5ato' would be represented as `[4, 5, 0, 0, 0]`, then len = 2, shift = 3, and
|
||||
offset points to [4, 5] in the pattern buffer.
|
||||
|
||||
Future extension: additional data representing nonstandard hyphenation. See
|
||||
[Automatic non-standard hyphenation in OpenOffice.org](https://www.tug.org/TUGboat/tb27-1/tb86nemeth.pdf)
|
||||
for more information about that issue.
|
||||
@@ -26,11 +26,8 @@
|
||||
|
||||
namespace android {
|
||||
|
||||
class Trie {
|
||||
public:
|
||||
std::vector<uint8_t> result;
|
||||
std::unordered_map<uint16_t, Trie> succ;
|
||||
};
|
||||
// hyb file header; implementation details are in the .cpp file
|
||||
struct Header;
|
||||
|
||||
class Hyphenator {
|
||||
public:
|
||||
@@ -44,19 +41,43 @@ public:
|
||||
// Example: word is "hyphen", result is [0 0 1 0 0 0], corresponding to "hy-phen".
|
||||
void hyphenate(std::vector<uint8_t>* result, const uint16_t* word, size_t len);
|
||||
|
||||
private:
|
||||
void addPattern(const uint16_t* pattern, size_t size);
|
||||
// pattern data is in binary format, as described in doc/hyb_file_format.md. Note:
|
||||
// the caller is responsible for ensuring that the lifetime of the pattern data is
|
||||
// at least as long as the Hyphenator object.
|
||||
|
||||
void hyphenateSoft(std::vector<uint8_t>* result, const uint16_t* word, size_t len);
|
||||
// Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens
|
||||
static Hyphenator* loadBinary(const uint8_t* patternData);
|
||||
|
||||
private:
|
||||
// apply soft hyphens only, ignoring patterns
|
||||
void hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len);
|
||||
|
||||
// try looking up word in alphabet table, return false if any code units fail to map
|
||||
// Note that this methor writes len+2 entries into alpha_codes (including start and stop)
|
||||
bool alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len);
|
||||
|
||||
// calculate hyphenation from patterns, assuming alphabet lookup has already been done
|
||||
void hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size_t len);
|
||||
|
||||
// TODO: these should become parameters, as they might vary by locale, screen size, and
|
||||
// possibly explicit user control.
|
||||
static const int MIN_PREFIX = 2;
|
||||
static const int MIN_SUFFIX = 3;
|
||||
|
||||
Trie root;
|
||||
// See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
|
||||
// that temporary buffers can be stack-allocated without waste, which is a slightly
|
||||
// different use case. It measures UTF-16 code units.
|
||||
static const size_t MAX_HYPHENATED_SIZE = 64;
|
||||
|
||||
const uint8_t* patternData;
|
||||
|
||||
// accessors for binary data
|
||||
const Header* getHeader() const {
|
||||
return reinterpret_cast<const Header*>(patternData);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace android
|
||||
|
||||
#endif // MINIKIN_HYPHENATOR_H
|
||||
#endif // MINIKIN_HYPHENATOR_H
|
||||
|
||||
@@ -63,3 +63,17 @@ LOCAL_C_INCLUDES := $(minikin_c_includes)
|
||||
LOCAL_SHARED_LIBRARIES := $(minikin_shared_libraries)
|
||||
|
||||
include $(BUILD_STATIC_LIBRARY)
|
||||
|
||||
include $(CLEAR_VARS)
|
||||
|
||||
# Reduced library (currently just hyphenation) for host
|
||||
|
||||
LOCAL_MODULE := libminikin_host
|
||||
LOCAL_MODULE_TAGS := optional
|
||||
LOCAL_EXPORT_C_INCLUDE_DIRS := frameworks/minikin/include
|
||||
LOCAL_C_INCLUDES := $(minikin_c_includes)
|
||||
LOCAL_SHARED_LIBRARIES := liblog libicuuc-host
|
||||
|
||||
LOCAL_SRC_FILES := Hyphenator.cpp
|
||||
|
||||
include $(BUILD_HOST_STATIC_LIBRARY)
|
||||
|
||||
@@ -34,130 +34,202 @@ namespace android {
|
||||
|
||||
static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
|
||||
|
||||
void Hyphenator::addPattern(const uint16_t* pattern, size_t size) {
|
||||
vector<uint16_t> word;
|
||||
vector<uint8_t> result;
|
||||
// The following are structs that correspond to tables inside the hyb file format
|
||||
|
||||
// start by parsing the Liang-format pattern into a word and a result vector, the
|
||||
// vector right-aligned but without leading zeros. Examples:
|
||||
// a1bc2d -> abcd [1, 0, 2, 0]
|
||||
// abc1 -> abc [1]
|
||||
// 1a2b3c4d5 -> abcd [1, 2, 3, 4, 5]
|
||||
bool lastWasLetter = false;
|
||||
bool haveSeenNumber = false;
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
uint16_t c = pattern[i];
|
||||
if (isdigit(c)) {
|
||||
result.push_back(c - '0');
|
||||
lastWasLetter = false;
|
||||
haveSeenNumber = true;
|
||||
} else {
|
||||
word.push_back(c);
|
||||
if (lastWasLetter && haveSeenNumber) {
|
||||
result.push_back(0);
|
||||
}
|
||||
lastWasLetter = true;
|
||||
}
|
||||
}
|
||||
if (lastWasLetter) {
|
||||
result.push_back(0);
|
||||
}
|
||||
Trie* t = &root;
|
||||
for (size_t i = 0; i < word.size(); i++) {
|
||||
t = &t->succ[word[i]];
|
||||
}
|
||||
t->result = result;
|
||||
}
|
||||
struct AlphabetTable0 {
|
||||
uint32_t version;
|
||||
uint32_t min_codepoint;
|
||||
uint32_t max_codepoint;
|
||||
uint8_t data[1]; // actually flexible array, size is known at runtime
|
||||
};
|
||||
|
||||
// If any soft hyphen is present in the word, use soft hyphens to decide hyphenation,
|
||||
// as recommended in UAX #14 (Use of Soft Hyphen)
|
||||
void Hyphenator::hyphenateSoft(vector<uint8_t>* result, const uint16_t* word, size_t len) {
|
||||
(*result)[0] = 0;
|
||||
for (size_t i = 1; i < len; i++) {
|
||||
(*result)[i] = word[i - 1] == CHAR_SOFT_HYPHEN;
|
||||
struct AlphabetTable1 {
|
||||
uint32_t version;
|
||||
uint32_t n_entries;
|
||||
uint32_t data[1]; // actually flexible array, size is known at runtime
|
||||
|
||||
static uint32_t codepoint(uint32_t entry) { return entry >> 11; }
|
||||
static uint32_t value(uint32_t entry) { return entry & 0x7ff; }
|
||||
};
|
||||
|
||||
struct Trie {
|
||||
uint32_t version;
|
||||
uint32_t char_mask;
|
||||
uint32_t link_shift;
|
||||
uint32_t link_mask;
|
||||
uint32_t pattern_shift;
|
||||
uint32_t n_entries;
|
||||
uint32_t data[1]; // actually flexible array, size is known at runtime
|
||||
};
|
||||
|
||||
struct Pattern {
|
||||
uint32_t version;
|
||||
uint32_t n_entries;
|
||||
uint32_t pattern_offset;
|
||||
uint32_t pattern_size;
|
||||
uint32_t data[1]; // actually flexible array, size is known at runtime
|
||||
|
||||
// accessors
|
||||
static uint32_t len(uint32_t entry) { return entry >> 26; }
|
||||
static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; }
|
||||
const uint8_t* buf(uint32_t entry) const {
|
||||
return reinterpret_cast<const uint8_t*>(this) + pattern_offset + (entry & 0xfffff);
|
||||
}
|
||||
};
|
||||
|
||||
struct Header {
|
||||
uint32_t magic;
|
||||
uint32_t version;
|
||||
uint32_t alphabet_offset;
|
||||
uint32_t trie_offset;
|
||||
uint32_t pattern_offset;
|
||||
uint32_t file_size;
|
||||
|
||||
// accessors
|
||||
const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); }
|
||||
uint32_t alphabetVersion() const {
|
||||
return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
|
||||
}
|
||||
const AlphabetTable0* alphabetTable0() const {
|
||||
return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
|
||||
}
|
||||
const AlphabetTable1* alphabetTable1() const {
|
||||
return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
|
||||
}
|
||||
const Trie* trieTable() const {
|
||||
return reinterpret_cast<const Trie*>(bytes() + trie_offset);
|
||||
}
|
||||
const Pattern* patternTable() const {
|
||||
return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
|
||||
}
|
||||
};
|
||||
|
||||
Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData) {
|
||||
Hyphenator* result = new Hyphenator;
|
||||
result->patternData = patternData;
|
||||
return result;
|
||||
}
|
||||
|
||||
void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t len) {
|
||||
result->clear();
|
||||
result->resize(len);
|
||||
if (len < MIN_PREFIX + MIN_SUFFIX) return;
|
||||
size_t maxOffset = len - MIN_SUFFIX + 1;
|
||||
for (size_t i = 0; i < len + 1; i++) {
|
||||
const Trie* node = &root;
|
||||
for (size_t j = i; j < len + 2; j++) {
|
||||
uint16_t c;
|
||||
if (j == 0 || j == len + 1) {
|
||||
c = '.'; // word boundary character in pattern data files
|
||||
} else {
|
||||
c = word[j - 1];
|
||||
if (c == CHAR_SOFT_HYPHEN) {
|
||||
hyphenateSoft(result, word, len);
|
||||
return;
|
||||
}
|
||||
// TODO: This uses ICU's simple character to character lowercasing, which ignores
|
||||
// the locale, and ignores cases when lowercasing a character results in more than
|
||||
// one character. It should be fixed to consider the locale (in order for it to work
|
||||
// correctly for Turkish and Azerbaijani), as well as support one-to-many, and
|
||||
// many-to-many case conversions (including non-BMP cases).
|
||||
if (c < 0x00C0) { // U+00C0 is the lowest uppercase non-ASCII character
|
||||
// Convert uppercase ASCII to lowercase ASCII, but keep other characters as-is
|
||||
if (0x0041 <= c && c <= 0x005A) {
|
||||
c += 0x0020;
|
||||
}
|
||||
} else {
|
||||
c = u_tolower(c);
|
||||
}
|
||||
const size_t paddedLen = len + 2; // start and stop code each count for 1
|
||||
if (patternData != nullptr &&
|
||||
(int)len >= MIN_PREFIX + MIN_SUFFIX && paddedLen <= MAX_HYPHENATED_SIZE) {
|
||||
uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
|
||||
if (alphabetLookup(alpha_codes, word, len)) {
|
||||
hyphenateFromCodes(result->data(), alpha_codes, paddedLen);
|
||||
return;
|
||||
}
|
||||
// TODO: try NFC normalization
|
||||
// TODO: handle non-BMP Unicode (requires remapping of offsets)
|
||||
}
|
||||
hyphenateSoft(result->data(), word, len);
|
||||
}
|
||||
|
||||
// If any soft hyphen is present in the word, use soft hyphens to decide hyphenation,
|
||||
// as recommended in UAX #14 (Use of Soft Hyphen)
|
||||
void Hyphenator::hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len) {
|
||||
result[0] = 0;
|
||||
for (size_t i = 1; i < len; i++) {
|
||||
result[i] = word[i - 1] == CHAR_SOFT_HYPHEN;
|
||||
}
|
||||
}
|
||||
|
||||
bool Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len) {
|
||||
const Header* header = getHeader();
|
||||
// TODO: check header magic
|
||||
uint32_t alphabetVersion = header->alphabetVersion();
|
||||
if (alphabetVersion == 0) {
|
||||
const AlphabetTable0* alphabet = header->alphabetTable0();
|
||||
uint32_t min_codepoint = alphabet->min_codepoint;
|
||||
uint32_t max_codepoint = alphabet->max_codepoint;
|
||||
alpha_codes[0] = 0; // word start
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
uint16_t c = word[i];
|
||||
if (c < min_codepoint || c >= max_codepoint) {
|
||||
return false;
|
||||
}
|
||||
auto search = node->succ.find(c);
|
||||
if (search != node->succ.end()) {
|
||||
node = &search->second;
|
||||
uint8_t code = alphabet->data[c - min_codepoint];
|
||||
if (code == 0) {
|
||||
return false;
|
||||
}
|
||||
alpha_codes[i + 1] = code;
|
||||
}
|
||||
alpha_codes[len + 1] = 0; // word termination
|
||||
return true;
|
||||
} else if (alphabetVersion == 1) {
|
||||
const AlphabetTable1* alphabet = header->alphabetTable1();
|
||||
size_t n_entries = alphabet->n_entries;
|
||||
const uint32_t* begin = alphabet->data;
|
||||
const uint32_t* end = begin + n_entries;
|
||||
alpha_codes[0] = 0;
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
uint16_t c = word[i];
|
||||
auto p = std::lower_bound(begin, end, c << 11);
|
||||
if (p == end) {
|
||||
return false;
|
||||
}
|
||||
uint32_t entry = *p;
|
||||
if (AlphabetTable1::codepoint(entry) != c) {
|
||||
return false;
|
||||
}
|
||||
alpha_codes[i + 1] = AlphabetTable1::value(entry);
|
||||
}
|
||||
alpha_codes[len + 1] = 0;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal implementation, after conversion to codes. All case folding and normalization
|
||||
* has been done by now, and all characters have been found in the alphabet.
|
||||
* Note: len here is the padded length including 0 codes at start and end.
|
||||
**/
|
||||
void Hyphenator::hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size_t len) {
|
||||
const Header* header = getHeader();
|
||||
const Trie* trie = header->trieTable();
|
||||
const Pattern* pattern = header->patternTable();
|
||||
uint32_t char_mask = trie->char_mask;
|
||||
uint32_t link_shift = trie->link_shift;
|
||||
uint32_t link_mask = trie->link_mask;
|
||||
uint32_t pattern_shift = trie->pattern_shift;
|
||||
size_t maxOffset = len - MIN_SUFFIX - 1;
|
||||
for (size_t i = 0; i < len - 1; i++) {
|
||||
uint32_t node = 0; // index into Trie table
|
||||
for (size_t j = i; j < len; j++) {
|
||||
uint16_t c = codes[j];
|
||||
uint32_t entry = trie->data[node + c];
|
||||
if ((entry & char_mask) == c) {
|
||||
node = (entry & link_mask) >> link_shift;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
if (!node->result.empty()) {
|
||||
int resultLen = node->result.size();
|
||||
int offset = j + 1 - resultLen;
|
||||
uint32_t pat_ix = trie->data[node] >> pattern_shift;
|
||||
// pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset
|
||||
// into the buf pool. This is the pattern for the substring (i..j) we just matched,
|
||||
// which we combine (via point-wise max) into the result vector.
|
||||
if (pat_ix != 0) {
|
||||
uint32_t pat_entry = pattern->data[pat_ix];
|
||||
int pat_len = Pattern::len(pat_entry);
|
||||
int pat_shift = Pattern::shift(pat_entry);
|
||||
const uint8_t* pat_buf = pattern->buf(pat_entry);
|
||||
int offset = j + 1 - (pat_len + pat_shift);
|
||||
// offset is the index within result that lines up with the start of pat_buf
|
||||
int start = std::max(MIN_PREFIX - offset, 0);
|
||||
int end = std::min(resultLen, (int)maxOffset - offset);
|
||||
// TODO performance: this inner loop can profitably be optimized
|
||||
int end = std::min(pat_len, (int)maxOffset - offset);
|
||||
for (int k = start; k < end; k++) {
|
||||
(*result)[offset + k] = std::max((*result)[offset + k], node->result[k]);
|
||||
result[offset + k] = std::max(result[offset + k], pat_buf[k]);
|
||||
}
|
||||
#if 0
|
||||
// debug printing of matched patterns
|
||||
std::string dbg;
|
||||
for (size_t k = i; k <= j + 1; k++) {
|
||||
int off = k - j - 2 + resultLen;
|
||||
if (off >= 0 && node->result[off] != 0) {
|
||||
dbg.push_back((char)('0' + node->result[off]));
|
||||
}
|
||||
if (k < j + 1) {
|
||||
uint16_t c = (k == 0 || k == len + 1) ? '.' : word[k - 1];
|
||||
dbg.push_back((char)c);
|
||||
}
|
||||
}
|
||||
ALOGD("%d:%d %s", i, j, dbg.c_str());
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
// Since the above calculation does not modify values outside
|
||||
// [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0.
|
||||
for (size_t i = MIN_PREFIX; i < maxOffset; i++) {
|
||||
(*result)[i] &= 1;
|
||||
result[i] &= 1;
|
||||
}
|
||||
}
|
||||
|
||||
Hyphenator* Hyphenator::load(const uint16_t *patternData, size_t size) {
|
||||
Hyphenator* result = new Hyphenator;
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
size_t end = i;
|
||||
while (patternData[end] != '\n') end++;
|
||||
result->addPattern(patternData + i, end - i);
|
||||
i = end;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace android
|
||||
|
||||
567
engine/src/flutter/tools/mk_hyb_file.py
Executable file
567
engine/src/flutter/tools/mk_hyb_file.py
Executable file
@@ -0,0 +1,567 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright (C) 2015 The Android Open Source Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the 'License');
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an 'AS IS' BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Convert hyphen files in standard TeX format (a trio of pat, chr, and hyp)
|
||||
into binary format. See doc/hyb_file_format.md for more information.
|
||||
|
||||
Usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb
|
||||
|
||||
Optional -v parameter turns on verbose debugging.
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import io
|
||||
import sys
|
||||
import struct
|
||||
import math
|
||||
import getopt
|
||||
|
||||
|
||||
VERBOSE = False
|
||||
|
||||
|
||||
if sys.version_info[0] >= 3:
|
||||
def unichr(x):
|
||||
return chr(x)
|
||||
|
||||
|
||||
# number of bits required to represent numbers up to n inclusive
|
||||
def num_bits(n):
|
||||
return 1 + int(math.log(n, 2)) if n > 0 else 0
|
||||
|
||||
|
||||
class Node:
|
||||
|
||||
def __init__(self):
|
||||
self.succ = {}
|
||||
self.res = None
|
||||
self.fsm_pat = None
|
||||
self.fail = None
|
||||
|
||||
|
||||
# List of free slots, implemented as doubly linked list
|
||||
class Freelist:
|
||||
|
||||
def __init__(self):
|
||||
self.first = None
|
||||
self.last = None
|
||||
self.pred = []
|
||||
self.succ = []
|
||||
|
||||
def grow(self):
|
||||
this = len(self.pred)
|
||||
self.pred.append(self.last)
|
||||
self.succ.append(None)
|
||||
if self.last is None:
|
||||
self.first = this
|
||||
else:
|
||||
self.succ[self.last] = this
|
||||
self.last = this
|
||||
|
||||
def next(self, cursor):
|
||||
if cursor == 0:
|
||||
cursor = self.first
|
||||
if cursor is None:
|
||||
self.grow()
|
||||
result = self.last
|
||||
else:
|
||||
result = cursor
|
||||
return result, self.succ[result]
|
||||
|
||||
def is_free(self, ix):
|
||||
while ix >= len(self.pred):
|
||||
self.grow()
|
||||
return self.pred[ix] != -1
|
||||
|
||||
def use(self, ix):
|
||||
if self.pred[ix] is None:
|
||||
self.first = self.succ[ix]
|
||||
else:
|
||||
self.succ[self.pred[ix]] = self.succ[ix]
|
||||
if self.succ[ix] is None:
|
||||
self.last = self.pred[ix]
|
||||
else:
|
||||
self.pred[self.succ[ix]] = self.pred[ix]
|
||||
if self.pred[ix] == -1:
|
||||
assert self.pred[ix] != -1, 'double free!'
|
||||
self.pred[ix] = -1
|
||||
|
||||
|
||||
def combine(a, b):
|
||||
if a is None: return b
|
||||
if b is None: return a
|
||||
if len(b) < len(a): a, b = b, a
|
||||
res = b[:len(b) - len(a)]
|
||||
for i in range(len(a)):
|
||||
res.append(max(a[i], b[i + len(b) - len(a)]))
|
||||
return res
|
||||
|
||||
|
||||
def trim(pattern):
|
||||
for ix in range(len(pattern)):
|
||||
if pattern[ix] != 0:
|
||||
return pattern[ix:]
|
||||
|
||||
|
||||
def pat_to_binary(pattern):
|
||||
return b''.join(struct.pack('B', x) for x in pattern)
|
||||
|
||||
|
||||
class Hyph:
|
||||
|
||||
def __init__(self):
|
||||
self.root = Node()
|
||||
self.root.str = '<root>'
|
||||
self.node_list = [self.root]
|
||||
|
||||
# Add a pattern (word fragment with numeric codes, such as ".ad4der")
|
||||
def add_pat(self, pat):
|
||||
lastWasLetter = False
|
||||
haveSeenNumber = False
|
||||
result = []
|
||||
word = ''
|
||||
for c in pat:
|
||||
if c.isdigit():
|
||||
result.append(int(c))
|
||||
lastWasLetter = False
|
||||
haveSeenNumber = True
|
||||
else:
|
||||
word += c
|
||||
if lastWasLetter and haveSeenNumber:
|
||||
result.append(0)
|
||||
lastWasLetter = True
|
||||
if lastWasLetter:
|
||||
result.append(0)
|
||||
|
||||
self.add_word_res(word, result)
|
||||
|
||||
# Add an exception (word with hyphens, such as "ta-ble")
|
||||
def add_exception(self, hyph_word):
|
||||
res = []
|
||||
word = ['.']
|
||||
need_10 = False
|
||||
for c in hyph_word:
|
||||
if c == '-':
|
||||
res.append(11)
|
||||
need_10 = False
|
||||
else:
|
||||
if need_10:
|
||||
res.append(10)
|
||||
word.append(c)
|
||||
need_10 = True
|
||||
word.append('.')
|
||||
res.append(0)
|
||||
res.append(0)
|
||||
if VERBOSE:
|
||||
print(word, res)
|
||||
self.add_word_res(''.join(word), res)
|
||||
|
||||
def add_word_res(self, word, result):
|
||||
if VERBOSE:
|
||||
print(word, result)
|
||||
|
||||
t = self.root
|
||||
s = ''
|
||||
for c in word:
|
||||
s += c
|
||||
if c not in t.succ:
|
||||
new_node = Node()
|
||||
new_node.str = s
|
||||
self.node_list.append(new_node)
|
||||
t.succ[c] = new_node
|
||||
t = t.succ[c]
|
||||
t.res = result
|
||||
|
||||
def pack(self, node_list, ch_map, use_node=False):
|
||||
size = 0
|
||||
self.node_map = {}
|
||||
nodes = Freelist()
|
||||
edges = Freelist()
|
||||
edge_start = 1 if use_node else 0
|
||||
for node in node_list:
|
||||
succ = sorted([ch_map[c] + edge_start for c in node.succ.keys()])
|
||||
if len(succ):
|
||||
cursor = 0
|
||||
while True:
|
||||
edge_ix, cursor = edges.next(cursor)
|
||||
ix = edge_ix - succ[0]
|
||||
if (ix >= 0 and nodes.is_free(ix) and
|
||||
all(edges.is_free(ix + s) for s in succ) and
|
||||
((not use_node) or edges.is_free(ix))):
|
||||
break
|
||||
elif use_node:
|
||||
ix, _ = edges.next(0)
|
||||
nodes.is_free(ix) # actually don't need nodes at all when use_node,
|
||||
# but keep it happy
|
||||
else:
|
||||
ix, _ = nodes.next(0)
|
||||
node.ix = ix
|
||||
self.node_map[ix] = node
|
||||
nodes.use(ix)
|
||||
size = max(size, ix)
|
||||
if use_node:
|
||||
edges.use(ix)
|
||||
for s in succ:
|
||||
edges.use(ix + s)
|
||||
size += max(ch_map.values()) + 1
|
||||
return size
|
||||
|
||||
# return list of nodes in bfs order
|
||||
def bfs(self, ch_map):
|
||||
result = [self.root]
|
||||
ix = 0
|
||||
while ix < len(result):
|
||||
node = result[ix]
|
||||
node.bfs_ix = ix
|
||||
mapped = {}
|
||||
for c, next in node.succ.items():
|
||||
assert ch_map[c] not in mapped, 'duplicate edge ' + node.str + ' ' + hex(ord(c))
|
||||
mapped[ch_map[c]] = next
|
||||
for i in sorted(mapped.keys()):
|
||||
result.append(mapped[i])
|
||||
ix += 1
|
||||
self.bfs_order = result
|
||||
return result
|
||||
|
||||
# suffix compression - convert the trie into an acyclic digraph, merging nodes when
|
||||
# the subtries are identical
|
||||
def dedup(self):
|
||||
uniques = []
|
||||
dupmap = {}
|
||||
dedup_ix = [0] * len(self.bfs_order)
|
||||
for ix in reversed(range(len(self.bfs_order))):
|
||||
# construct string representation of node
|
||||
node = self.bfs_order[ix]
|
||||
if node.res is None:
|
||||
s = ''
|
||||
else:
|
||||
s = ''.join(str(c) for c in node.res)
|
||||
for c in sorted(node.succ.keys()):
|
||||
succ = node.succ[c]
|
||||
s += ' ' + c + str(dedup_ix[succ.bfs_ix])
|
||||
if s in dupmap:
|
||||
dedup_ix[ix] = dupmap[s]
|
||||
else:
|
||||
uniques.append(node)
|
||||
dedup_ix[ix] = ix
|
||||
dupmap[s] = dedup_ix[ix]
|
||||
uniques.reverse()
|
||||
print(len(uniques), 'unique nodes,', len(self.bfs_order), 'total')
|
||||
return dedup_ix, uniques
|
||||
|
||||
|
||||
# load the ".pat" file, which contains patterns such as a1b2c3
|
||||
def load(fn):
|
||||
hyph = Hyph()
|
||||
with io.open(fn, encoding='UTF-8') as f:
|
||||
for l in f:
|
||||
pat = l.strip()
|
||||
hyph.add_pat(pat)
|
||||
return hyph
|
||||
|
||||
|
||||
# load the ".chr" file, which contains the alphabet and case pairs, eg "aA", "bB" etc.
|
||||
def load_chr(fn):
|
||||
ch_map = {'.': 0}
|
||||
with io.open(fn, encoding='UTF-8') as f:
|
||||
for i, l in enumerate(f):
|
||||
l = l.strip()
|
||||
if len(l) > 2:
|
||||
# lowercase maps to multi-character uppercase sequence, ignore uppercase for now
|
||||
l = l[:1]
|
||||
else:
|
||||
assert len(l) == 2, 'expected 2 chars in chr'
|
||||
for c in l:
|
||||
ch_map[c] = i + 1
|
||||
return ch_map
|
||||
|
||||
|
||||
# load exceptions with explicit hyphens
|
||||
def load_hyp(hyph, fn):
|
||||
with io.open(fn, encoding='UTF-8') as f:
|
||||
for l in f:
|
||||
hyph.add_exception(l.strip())
|
||||
|
||||
|
||||
def generate_header(alphabet, trie, pattern):
|
||||
alphabet_off = 6 * 4
|
||||
trie_off = alphabet_off + len(alphabet)
|
||||
pattern_off = trie_off + len(trie)
|
||||
file_size = pattern_off + len(pattern)
|
||||
data = [0x62ad7968, 0, alphabet_off, trie_off, pattern_off, file_size]
|
||||
return struct.pack('<6I', *data)
|
||||
|
||||
|
||||
def generate_alphabet(ch_map):
|
||||
ch_map = ch_map.copy()
|
||||
del ch_map['.']
|
||||
min_ch = ord(min(ch_map))
|
||||
max_ch = ord(max(ch_map))
|
||||
if max_ch - min_ch < 1024 and max(ch_map.values()) < 256:
|
||||
# generate format 0
|
||||
data = [0] * (max_ch - min_ch + 1)
|
||||
for c, val in ch_map.items():
|
||||
data[ord(c) - min_ch] = val
|
||||
result = [struct.pack('<3I', 0, min_ch, max_ch + 1)]
|
||||
for b in data:
|
||||
result.append(struct.pack('<B', b))
|
||||
else:
|
||||
# generate format 1
|
||||
assert max(ch_map.values()) < 2048, 'max number of unique characters exceeded'
|
||||
result = [struct.pack('<2I', 1, len(ch_map))]
|
||||
for c, val in sorted(ch_map.items()):
|
||||
data = (ord(c) << 11) | val
|
||||
result.append(struct.pack('<I', data))
|
||||
binary = b''.join(result)
|
||||
if len(binary) % 4 != 0:
|
||||
binary += b'\x00' * (4 - len(binary) % 4)
|
||||
return binary
|
||||
|
||||
|
||||
# assumes hyph structure has been packed, ie node.ix values have been set
|
||||
def generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap):
|
||||
ch_array = [0] * n_trie
|
||||
link_array = [0] * n_trie
|
||||
pat_array = [0] * n_trie
|
||||
link_shift = num_bits(max(ch_map.values()))
|
||||
char_mask = (1 << link_shift) - 1
|
||||
pattern_shift = link_shift + num_bits(n_trie - 1)
|
||||
link_mask = (1 << pattern_shift) - (1 << link_shift)
|
||||
result = [struct.pack('<6I', 0, char_mask, link_shift, link_mask, pattern_shift, n_trie)]
|
||||
|
||||
for node in dedup_nodes:
|
||||
ix = node.ix
|
||||
if node.res is not None:
|
||||
pat_array[ix] = patmap[pat_to_binary(node.res)]
|
||||
for c, next in node.succ.items():
|
||||
c_num = ch_map[c]
|
||||
link_ix = ix + c_num
|
||||
ch_array[link_ix] = c_num
|
||||
if dedup_ix is None:
|
||||
dedup_next = next
|
||||
else:
|
||||
dedup_next = hyph.bfs_order[dedup_ix[next.bfs_ix]]
|
||||
link_array[link_ix] = dedup_next.ix
|
||||
|
||||
for i in range(n_trie):
|
||||
#print((pat_array[i], link_array[i], ch_array[i]))
|
||||
packed = (pat_array[i] << pattern_shift) | (link_array[i] << link_shift) | ch_array[i]
|
||||
result.append(struct.pack('<I', packed))
|
||||
return b''.join(result)
|
||||
|
||||
|
||||
def generate_pattern(pats):
|
||||
pat_array = [0]
|
||||
patmap = {b'': 0}
|
||||
|
||||
raw_pat_array = []
|
||||
raw_pat_size = 0
|
||||
raw_patmap = {}
|
||||
|
||||
for pat in pats:
|
||||
if pat is None:
|
||||
continue
|
||||
pat_str = pat_to_binary(pat)
|
||||
if pat_str not in patmap:
|
||||
shift = 0
|
||||
while shift < len(pat) and pat[len(pat) - shift - 1] == 0:
|
||||
shift += 1
|
||||
rawpat = pat_str[:len(pat) - shift]
|
||||
if rawpat not in raw_patmap:
|
||||
raw_patmap[rawpat] = raw_pat_size
|
||||
raw_pat_array.append(rawpat)
|
||||
raw_pat_size += len(rawpat)
|
||||
data = (len(rawpat) << 26) | (shift << 20) | raw_patmap[rawpat]
|
||||
patmap[pat_str] = len(pat_array)
|
||||
pat_array.append(data)
|
||||
data = [0, len(pat_array), 16 + 4 * len(pat_array), raw_pat_size]
|
||||
result = [struct.pack('<4I', *data)]
|
||||
for x in pat_array:
|
||||
result.append(struct.pack('<I', x))
|
||||
result.extend(raw_pat_array)
|
||||
return patmap, b''.join(result)
|
||||
|
||||
|
||||
def generate_hyb_file(hyph, ch_map, hyb_fn):
|
||||
bfs = hyph.bfs(ch_map)
|
||||
dedup_ix, dedup_nodes = hyph.dedup()
|
||||
n_trie = hyph.pack(dedup_nodes, ch_map)
|
||||
alphabet = generate_alphabet(ch_map)
|
||||
patmap, pattern = generate_pattern([n.res for n in hyph.node_list])
|
||||
trie = generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap)
|
||||
header = generate_header(alphabet, trie, pattern)
|
||||
|
||||
with open(hyb_fn, 'wb') as f:
|
||||
f.write(header)
|
||||
f.write(alphabet)
|
||||
f.write(trie)
|
||||
f.write(pattern)
|
||||
|
||||
|
||||
# Verify that the file contains the same lines as the lines argument, in arbitrary order
|
||||
def verify_file_sorted(lines, fn):
|
||||
file_lines = [l.strip() for l in io.open(fn)]
|
||||
line_set = set(lines)
|
||||
file_set = set(file_lines)
|
||||
if line_set == file_set:
|
||||
return True
|
||||
for line in line_set - file_set:
|
||||
print(repr(line) + ' in reconstruction, not in file')
|
||||
for line in file_set - line_set:
|
||||
print(repr(line) + ' in file, not in reconstruction')
|
||||
return False
|
||||
|
||||
|
||||
def map_to_chr(alphabet_map):
|
||||
result = []
|
||||
ch_map = {}
|
||||
for val in alphabet_map.values():
|
||||
chs = [ch for ch in alphabet_map if alphabet_map[ch] == val]
|
||||
# non-cased characters (like Ethopic) are in both, matching chr file
|
||||
lowercase = [ch for ch in chs if not ch.isupper()]
|
||||
uppercase = [ch for ch in chs if not ch.islower()]
|
||||
# print(val, `lowercase`, `uppercase`)
|
||||
assert len(lowercase) == 1, 'expected 1 lowercase character'
|
||||
assert 0 <= len(uppercase) <= 1, 'expected 0 or 1 uppercase character'
|
||||
ch_map[val] = lowercase[0]
|
||||
result.append(''.join(lowercase + uppercase))
|
||||
ch_map[0] = '.'
|
||||
return (ch_map, result)
|
||||
|
||||
|
||||
def get_pattern(pattern_data, ix):
|
||||
pattern_offset = struct.unpack('<I', pattern_data[8:12])[0]
|
||||
entry = struct.unpack('<I', pattern_data[16 + ix * 4: 16 + ix * 4 + 4])[0]
|
||||
pat_len = entry >> 26
|
||||
pat_shift = (entry >> 20) & 0x1f
|
||||
offset = pattern_offset + (entry & 0xfffff)
|
||||
return pattern_data[offset: offset + pat_len] + b'\0' * pat_shift
|
||||
|
||||
|
||||
def traverse_trie(ix, s, trie_data, ch_map, pattern_data, patterns, exceptions):
|
||||
(char_mask, link_shift, link_mask, pattern_shift) = struct.unpack('<4I', trie_data[4:20])
|
||||
node_entry = struct.unpack('<I', trie_data[24 + ix * 4: 24 + ix * 4 + 4])[0]
|
||||
pattern = node_entry >> pattern_shift
|
||||
if pattern:
|
||||
result = []
|
||||
is_exception = False
|
||||
pat = get_pattern(pattern_data, pattern)
|
||||
for i in range(len(s) + 1):
|
||||
pat_off = i - 1 + len(pat) - len(s)
|
||||
if pat_off < 0:
|
||||
code = 0
|
||||
else:
|
||||
code = struct.unpack('B', pat[pat_off : pat_off + 1])[0]
|
||||
if 1 <= code <= 9:
|
||||
result.append('%d' % code)
|
||||
elif code == 10:
|
||||
is_exception = True
|
||||
elif code == 11:
|
||||
result.append('-')
|
||||
is_exception = True
|
||||
else:
|
||||
assert code == 0, 'unexpected code'
|
||||
if i < len(s):
|
||||
result.append(s[i])
|
||||
pat_str = ''.join(result)
|
||||
#print(`pat_str`, `pat`)
|
||||
if is_exception:
|
||||
assert pat_str[0] == '.', "expected leading '.'"
|
||||
assert pat_str[-1] == '.', "expected trailing '.'"
|
||||
exceptions.append(pat_str[1:-1]) # strip leading and trailing '.'
|
||||
else:
|
||||
patterns.append(pat_str)
|
||||
for ch in ch_map:
|
||||
edge_entry = struct.unpack('<I', trie_data[24 + (ix + ch) * 4: 24 + (ix + ch) * 4 + 4])[0]
|
||||
link = (edge_entry & link_mask) >> link_shift
|
||||
if link != 0 and ch == (edge_entry & char_mask):
|
||||
sch = s + ch_map[ch]
|
||||
traverse_trie(link, sch, trie_data, ch_map, pattern_data, patterns, exceptions)
|
||||
|
||||
|
||||
# Verify the generated binary file by reconstructing the textual representations
|
||||
# from the binary hyb file, then checking that they're identical (mod the order of
|
||||
# lines within the file, which is irrelevant). This function makes assumptions that
|
||||
# are stronger than absolutely necessary (in particular, that the patterns are in
|
||||
# lowercase as defined by python islower).
|
||||
def verify_hyb_file(hyb_fn, pat_fn, chr_fn, hyp_fn):
|
||||
with open(hyb_fn, 'rb') as f:
|
||||
hyb_data = f.read()
|
||||
header = hyb_data[0: 6 * 4]
|
||||
(magic, version, alphabet_off, trie_off, pattern_off, file_size) = struct.unpack('<6I', header)
|
||||
alphabet_data = hyb_data[alphabet_off:trie_off]
|
||||
trie_data = hyb_data[trie_off:pattern_off]
|
||||
pattern_data = hyb_data[pattern_off:file_size]
|
||||
|
||||
# reconstruct alphabet table
|
||||
alphabet_version = struct.unpack('<I', alphabet_data[:4])[0]
|
||||
alphabet_map = {}
|
||||
if alphabet_version == 0:
|
||||
(min_ch, max_ch) = struct.unpack('<2I', alphabet_data[4:12])
|
||||
for ch in range(min_ch, max_ch):
|
||||
offset = 12 + ch - min_ch
|
||||
b = struct.unpack('B', alphabet_data[offset : offset + 1])[0]
|
||||
if b != 0:
|
||||
alphabet_map[unichr(ch)] = b
|
||||
else:
|
||||
assert alphabet_version == 1
|
||||
n_entries = struct.unpack('<I', alphabet_data[4:8])[0]
|
||||
for i in range(n_entries):
|
||||
entry = struct.unpack('<I', alphabet_data[8 + 4 * i: 8 + 4 * i + 4])[0]
|
||||
alphabet_map[unichr(entry >> 11)] = entry & 0x7ff
|
||||
|
||||
ch_map, reconstructed_chr = map_to_chr(alphabet_map)
|
||||
|
||||
# EXCEPTION for Armenian (hy), we don't really deal with the uppercase form of U+0587
|
||||
if u'\u0587' in reconstructed_chr:
|
||||
reconstructed_chr.remove(u'\u0587')
|
||||
reconstructed_chr.append(u'\u0587\u0535\u0552')
|
||||
|
||||
assert verify_file_sorted(reconstructed_chr, chr_fn), 'alphabet table not verified'
|
||||
|
||||
# reconstruct trie
|
||||
patterns = []
|
||||
exceptions = []
|
||||
traverse_trie(0, '', trie_data, ch_map, pattern_data, patterns, exceptions)
|
||||
assert verify_file_sorted(patterns, pat_fn), 'pattern table not verified'
|
||||
assert verify_file_sorted(exceptions, hyp_fn), 'exception table not verified'
|
||||
|
||||
|
||||
def main():
|
||||
global VERBOSE
|
||||
try:
|
||||
opts, args = getopt.getopt(sys.argv[1:], 'v')
|
||||
except getopt.GetoptError as err:
|
||||
print(str(err))
|
||||
sys.exit(1)
|
||||
for o, _ in opts:
|
||||
if o == '-v':
|
||||
VERBOSE = True
|
||||
pat_fn, out_fn = args
|
||||
hyph = load(pat_fn)
|
||||
if pat_fn.endswith('.pat.txt'):
|
||||
chr_fn = pat_fn[:-8] + '.chr.txt'
|
||||
ch_map = load_chr(chr_fn)
|
||||
hyp_fn = pat_fn[:-8] + '.hyp.txt'
|
||||
load_hyp(hyph, hyp_fn)
|
||||
generate_hyb_file(hyph, ch_map, out_fn)
|
||||
verify_hyb_file(out_fn, pat_fn, chr_fn, hyp_fn)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user