ICU 76.1  76.1
normalizer2.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: normalizer2.h
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
17 */
18 
19 #ifndef __NORMALIZER2_H__
20 #define __NORMALIZER2_H__
21 
27 #include "unicode/utypes.h"
28 
29 #if U_SHOW_CPLUSPLUS_API
30 
31 #if !UCONFIG_NO_NORMALIZATION
32 
33 #include "unicode/stringpiece.h"
34 #include "unicode/uniset.h"
35 #include "unicode/unistr.h"
36 #include "unicode/unorm2.h"
37 
38 U_NAMESPACE_BEGIN
39 
40 class ByteSink;
41 
85 class U_COMMON_API Normalizer2 : public UObject {
86 public:
91  ~Normalizer2();
92 
104  static const Normalizer2 *
105  getNFCInstance(UErrorCode &errorCode);
106 
118  static const Normalizer2 *
119  getNFDInstance(UErrorCode &errorCode);
120 
132  static const Normalizer2 *
133  getNFKCInstance(UErrorCode &errorCode);
134 
146  static const Normalizer2 *
147  getNFKDInstance(UErrorCode &errorCode);
148 
163  static const Normalizer2 *
164  getNFKCCasefoldInstance(UErrorCode &errorCode);
165 
180  static const Normalizer2 *
181  getNFKCSimpleCasefoldInstance(UErrorCode &errorCode);
182 
204  static const Normalizer2 *
205  getInstance(const char *packageName,
206  const char *name,
207  UNormalization2Mode mode,
208  UErrorCode &errorCode);
209 
220  UnicodeString
221  normalize(const UnicodeString &src, UErrorCode &errorCode) const {
222  UnicodeString result;
223  normalize(src, result, errorCode);
224  return result;
225  }
239  virtual UnicodeString &
240  normalize(const UnicodeString &src,
241  UnicodeString &dest,
242  UErrorCode &errorCode) const = 0;
243 
266  virtual void
267  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
268  Edits *edits, UErrorCode &errorCode) const;
269 
284  virtual UnicodeString &
285  normalizeSecondAndAppend(UnicodeString &first,
286  const UnicodeString &second,
287  UErrorCode &errorCode) const = 0;
302  virtual UnicodeString &
303  append(UnicodeString &first,
304  const UnicodeString &second,
305  UErrorCode &errorCode) const = 0;
306 
320  virtual UBool
321  getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
322 
347  virtual UBool
348  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
349 
365  virtual UChar32
366  composePair(UChar32 a, UChar32 b) const;
367 
376  virtual uint8_t
377  getCombiningClass(UChar32 c) const;
378 
393  virtual UBool
394  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
414  virtual UBool
415  isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
416 
417 
433  virtual UNormalizationCheckResult
434  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
435 
458  virtual int32_t
459  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
460 
474  virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
475 
490  virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
491 
505  virtual UBool isInert(UChar32 c) const = 0;
506 };
507 
519 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
520 public:
531  FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
532  norm2(n2), set(filterSet) {}
533 
538  ~FilteredNormalizer2();
539 
553  virtual UnicodeString &
554  normalize(const UnicodeString &src,
555  UnicodeString &dest,
556  UErrorCode &errorCode) const override;
557 
580  virtual void
581  normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
582  Edits *edits, UErrorCode &errorCode) const override;
583 
598  virtual UnicodeString &
599  normalizeSecondAndAppend(UnicodeString &first,
600  const UnicodeString &second,
601  UErrorCode &errorCode) const override;
616  virtual UnicodeString &
617  append(UnicodeString &first,
618  const UnicodeString &second,
619  UErrorCode &errorCode) const override;
620 
632  virtual UBool
633  getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
634 
646  virtual UBool
647  getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
648 
659  virtual UChar32
660  composePair(UChar32 a, UChar32 b) const override;
661 
670  virtual uint8_t
671  getCombiningClass(UChar32 c) const override;
672 
684  virtual UBool
685  isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
705  virtual UBool
706  isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
718  virtual UNormalizationCheckResult
719  quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
731  virtual int32_t
732  spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
733 
742  virtual UBool hasBoundaryBefore(UChar32 c) const override;
743 
752  virtual UBool hasBoundaryAfter(UChar32 c) const override;
753 
761  virtual UBool isInert(UChar32 c) const override;
762 private:
763  UnicodeString &
764  normalize(const UnicodeString &src,
765  UnicodeString &dest,
766  USetSpanCondition spanCondition,
767  UErrorCode &errorCode) const;
768 
769  void
770  normalizeUTF8(uint32_t options, const char *src, int32_t length,
771  ByteSink &sink, Edits *edits,
772  USetSpanCondition spanCondition,
773  UErrorCode &errorCode) const;
774 
775  UnicodeString &
776  normalizeSecondAndAppend(UnicodeString &first,
777  const UnicodeString &second,
778  UBool doNormalize,
779  UErrorCode &errorCode) const;
780 
781  const Normalizer2 &norm2;
782  const UnicodeSet &set;
783 };
784 
785 U_NAMESPACE_END
786 
787 #endif // !UCONFIG_NO_NORMALIZATION
788 
789 #endif /* U_SHOW_CPLUSPLUS_API */
790 
791 #endif // __NORMALIZER2_H__
virtual UBool hasBoundaryBefore(UChar32 c) const =0
Tests if the character always has a normalization boundary before it, regardless of context...
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const =0
Returns the end of the normalized substring of the input string.
C++ API: Unicode String.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const
Gets the raw decomposition mapping of c.
A ByteSink can be filled with bytes.
Definition: bytestream.h:53
virtual uint8_t getCombiningClass(UChar32 c) const
Gets the combining class of c.
UnicodeString normalize(const UnicodeString &src, UErrorCode &errorCode) const
Returns the normalized form of the source string.
Definition: normalizer2.h:221
Records lengths of string edits but not replacement text.
Definition: edits.h:80
C++ API: StringPiece: Read-only byte string wrapper class.
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the second string to the first string (merging them at the boundary) and returns the first st...
Unicode normalization functionality for standard Unicode normalization or for using custom mapping ta...
Definition: normalizer2.h:85
C API: New API for Unicode Normalization.
virtual UBool hasBoundaryAfter(UChar32 c) const =0
Tests if the character always has a normalization boundary after it, regardless of context...
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:427
virtual UChar32 composePair(UChar32 a, UChar32 b) const
Performs pairwise composition of a & b and returns the composite if there is one. ...
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet)
Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
Definition: normalizer2.h:531
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:285
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition: uset.h:185
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const
Tests if the UTF-8 string is normalized.
UNormalization2Mode
Constants for normalization modes.
Definition: unorm2.h:48
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the normalized form of the second string to the first string (merging them at the boundary) a...
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:430
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const =0
Gets the decomposition mapping of c.
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchange...
Basic definitions for ICU, for both C and C++ APIs.
virtual UBool isInert(UChar32 c) const =0
Tests if the character is normalization-inert.
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:315
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:295
A string-like object that points to a sized piece of memory.
Definition: stringpiece.h:61
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
Normalization filtered by a UnicodeSet.
Definition: normalizer2.h:519
UNormalizationCheckResult
Result values for normalization quick check functions.
Definition: unorm2.h:97
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition: umachine.h:247
C++ API: Unicode Set.