22 #include <unordered_map> 25 #include "UTF8StringSlice.hpp" 31 typedef UTF8StringSlice::LengthType LengthType;
// Entry point taking the input text by const reference. The steps visible in
// this listing, in order, are CalculateSuffixEntropy(),
// CalculatePrefixEntropy() and ExtractWordCandidates().
// NOTE(review): the embedded listing numbers jump (39 -> 43 -> 46 -> 48), so
// the statements between these calls and the closing brace are not shown
// here — confirm the full sequence against the original header.
39 void Extract(
const std::string& text) {
43 CalculateSuffixEntropy();
46 CalculatePrefixEntropy();
48 ExtractWordCandidates();
// SetFullText overload taking the text as a std::string.
// NOTE(review): the body is not visible in this listing; presumably it ends
// up populating utf8FullText like the UTF8StringSlice overload — confirm.
53 void SetFullText(
const std::string& fullText) {
// SetFullText overload taking the text as a C string pointer.
// NOTE(review): the body is not visible in this listing; whether the pointed-to
// buffer is copied or merely referenced cannot be told from here — confirm
// before relying on the caller's buffer lifetime.
57 void SetFullText(
const char* fullText) {
61 void SetFullText(
const UTF8StringSlice& fullText) { utf8FullText = fullText; }
// Assigns _wordMinLength to the wordMinLength member.
// NOTE(review): presumably the lower bound on the length of an extracted
// word, in whatever unit LengthType counts — confirm.
63 void SetWordMinLength(
const LengthType _wordMinLength) {
64 wordMinLength = _wordMinLength;
// Assigns _wordMaxLength to the wordMaxLength member.
// NOTE(review): presumably the upper bound on the length of an extracted
// word, in whatever unit LengthType counts — confirm.
67 void SetWordMaxLength(
const LengthType _wordMaxLength) {
68 wordMaxLength = _wordMaxLength;
// Assigns _prefixSetLength to the prefixSetLength member.
// NOTE(review): how this length is used when prefixes are extracted is not
// visible in this listing — confirm against the implementation.
71 void SetPrefixSetLength(
const LengthType _prefixSetLength) {
72 prefixSetLength = _prefixSetLength;
// Assigns _suffixSetLength to the suffixSetLength member.
// NOTE(review): how this length is used when suffixes are extracted is not
// visible in this listing — confirm against the implementation.
75 void SetSuffixSetLength(
const LengthType _suffixSetLength) {
76 suffixSetLength = _suffixSetLength;
// Assigns `filter` to the preCalculationFilter member.
// NOTE(review): the beginning of the parameter type is missing from this
// listing; the member it is stored in is declared as
// std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>.
// When the filter is invoked and what a true/false result means are not
// shown here — confirm against the implementation.
80 void SetPreCalculationFilter(
82 const UTF8StringSlice8Bit&)>& filter) {
83 preCalculationFilter = filter;
// Assigns `filter` to the postCalculationFilter member.
// NOTE(review): the beginning of the parameter type is missing from this
// listing; the member it is stored in is declared as
// std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>.
// When the filter is invoked and what a true/false result means are not
// shown here — confirm against the implementation.
86 void SetPostCalculationFilter(
88 const UTF8StringSlice8Bit&)>& filter) {
89 postCalculationFilter = filter;
92 void ReleaseSuffixes() { std::vector<UTF8StringSlice8Bit>().swap(suffixes); }
94 void ReleasePrefixes() { std::vector<UTF8StringSlice8Bit>().swap(prefixes); }
96 const std::vector<UTF8StringSlice8Bit>& Words()
const {
return words; }
// Read-only access to the `wordCandidates` member; the returned reference
// aliases the member rather than copying it.
98 const std::vector<UTF8StringSlice8Bit>& WordCandidates()
const {
99 return wordCandidates;
// Two entropy values stored as doubles.
// NOTE(review): the enclosing declaration is not visible in this listing;
// these look like fields of the Signals record returned by Signal() — confirm.
105 double suffixEntropy;
106 double prefixEntropy;
// Returns a const reference to the Signals record for `wordCandidate`.
// Declaration only; the definition is outside this listing.
// NOTE(review): behaviour for a candidate with no record is not visible
// here — confirm before calling with arbitrary slices.
109 const Signals& Signal(
const UTF8StringSlice8Bit& wordCandidate)
const;
// Returns the cohesion value for `wordCandidate` as a double.
// Declaration only; the definition is outside this listing.
// NOTE(review): presumably valid only after CalculateCohesions() has run —
// confirm.
111 double Cohesion(
const UTF8StringSlice8Bit& wordCandidate)
const;
// Returns an entropy value for `wordCandidate` as a double.
// Declaration only; the definition is outside this listing.
// NOTE(review): how this relates to SuffixEntropy() and PrefixEntropy() is
// not visible here — confirm against the implementation.
113 double Entropy(
const UTF8StringSlice8Bit& wordCandidate)
const;
// Returns the suffix entropy for `wordCandidate` as a double.
// Declaration only; the definition is outside this listing.
// NOTE(review): presumably valid only after CalculateSuffixEntropy() has
// run — confirm.
115 double SuffixEntropy(
const UTF8StringSlice8Bit& wordCandidate)
const;
// Returns the prefix entropy for `wordCandidate` as a double.
// Declaration only; the definition is outside this listing.
// NOTE(review): presumably valid only after CalculatePrefixEntropy() has
// run — confirm.
117 double PrefixEntropy(
const UTF8StringSlice8Bit& wordCandidate)
const;
// Returns a size_t frequency for `word`.
// Declaration only; the definition is outside this listing.
// NOTE(review): presumably the occurrence count gathered by
// CalculateFrequency() — confirm.
119 size_t Frequency(
const UTF8StringSlice8Bit& word)
const;
// Returns a probability for `word` as a double.
// Declaration only; the definition is outside this listing.
// NOTE(review): presumably Frequency(word) relative to totalOccurrence —
// confirm against the implementation.
121 double Probability(
const UTF8StringSlice8Bit& word)
const;
// Returns a log-probability for `word` as a double.
// Declaration only; the definition is outside this listing.
// NOTE(review): the logarithm base and the use of logTotalOccurrence are not
// visible here — confirm against the implementation.
123 double LogProbability(
const UTF8StringSlice8Bit& word)
const;
// Individual processing steps, declared here and defined outside this
// listing. Each takes no arguments and returns nothing.
// NOTE(review): presumably they read and write the object's member state and
// have ordering requirements between them (see the *Extracted / *Calculated
// flags among the members) — confirm before calling them individually.
127 void ExtractSuffixes();
129 void ExtractPrefixes();
131 void ExtractWordCandidates();
133 void CalculateFrequency();
135 void CalculateCohesions();
137 void CalculateSuffixEntropy();
139 void CalculatePrefixEntropy();
// Returns a double computed from a candidate and two slices named part1 and
// part2. Declaration only; the definition is outside this listing.
// NOTE(review): PMI presumably stands for pointwise mutual information of
// the two parts with respect to the whole candidate — confirm.
155 double PMI(
const UTF8StringSlice8Bit& wordCandidate,
156 const UTF8StringSlice8Bit& part1,
157 const UTF8StringSlice8Bit& part2)
const;
// Computes and returns the cohesion of a single `wordCandidate` as a double
// without modifying the object (const member).
// Declaration only; the definition is outside this listing.
159 double CalculateCohesion(
const UTF8StringSlice8Bit& wordCandidate)
const;
// Returns a double; the first parameter is a const std::unordered_map keyed
// by UTF8StringSlice8Bit with size_t values.
// NOTE(review): the declaration is cut off in this listing (remaining
// template arguments, parameter name and qualifiers are missing) — consult
// the original header for the full signature.
161 double CalculateEntropy(
162 const std::unordered_map<UTF8StringSlice8Bit,
size_t,
// Configuration members. The four lengths are written by SetWordMinLength(),
// SetWordMaxLength(), SetPrefixSetLength() and SetSuffixSetLength().
165 LengthType wordMinLength;
166 LengthType wordMaxLength;
167 LengthType prefixSetLength;
168 LengthType suffixSetLength;
// Filter callbacks written by SetPreCalculationFilter() and
// SetPostCalculationFilter(); each receives the extractor and a slice and
// returns bool.
// NOTE(review): whether `true` means "keep" or "discard" is not visible in
// this listing — confirm against the implementation.
169 std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
170 preCalculationFilter;
171 std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
172 postCalculationFilter;
// Boolean progress flags, one per processing step.
// NOTE(review): presumably each is set once the matching Extract*/Calculate*
// step has completed and is checked by later steps; where they are read and
// written is not visible in this listing — confirm.
174 bool prefixesExtracted;
175 bool suffixesExtracted;
176 bool frequenciesCalculated;
177 bool wordCandidatesExtracted;
178 bool cohesionsCalculated;
179 bool prefixEntropiesCalculated;
180 bool suffixEntropiesCalculated;
// Aggregate counters.
// NOTE(review): logTotalOccurrence is presumably the logarithm of
// totalOccurrence cached for the probability helpers — confirm.
184 size_t totalOccurrence;
185 double logTotalOccurrence;
// Working sets of slices. `prefixes` and `suffixes` are emptied by
// ReleasePrefixes() / ReleaseSuffixes(); `wordCandidates` and `words` are
// exposed read-only through WordCandidates() and Words().
186 std::vector<UTF8StringSlice8Bit> prefixes;
187 std::vector<UTF8StringSlice8Bit> suffixes;
188 std::vector<UTF8StringSlice8Bit> wordCandidates;
189 std::vector<UTF8StringSlice8Bit> words;
// Grants the PhraseExtractTest class access to non-public members.
192 friend class PhraseExtractTest;
Definition: UTF8StringSlice.hpp:202
Definition: Performance.cpp:16
Definition: UTF8StringSlice.hpp:54