utf8proc
C library for processing UTF-8 Unicode data
Loading...
Searching...
No Matches
utf8proc.h File Reference
#include <stdlib.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
#include <limits.h>

Go to the source code of this file.

Data Structures

struct  utf8proc_property_struct
 

Macros

#define UTF8PROC_DLLEXPORT
 
API version

The utf8proc API version MAJOR.MINOR.PATCH, following semantic-versioning rules (http://semver.org) based on API compatibility.

This is also returned at runtime by utf8proc_version(); however, the runtime version may append a string like "-dev" to the version number for prerelease versions.

Note
The shared-library version number in the Makefile (and CMakeLists.txt, and MANIFEST) may be different, being based on ABI compatibility rather than API compatibility.
#define UTF8PROC_VERSION_MAJOR   2
 
#define UTF8PROC_VERSION_MINOR   10
 
#define UTF8PROC_VERSION_PATCH   0
 
Error codes

Error codes being returned by almost all functions.

#define UTF8PROC_ERROR_NOMEM   -1
 
#define UTF8PROC_ERROR_OVERFLOW   -2
 
#define UTF8PROC_ERROR_INVALIDUTF8   -3
 
#define UTF8PROC_ERROR_NOTASSIGNED   -4
 
#define UTF8PROC_ERROR_INVALIDOPTS   -5
 

Typedefs

typedef int8_t utf8proc_int8_t
 
typedef uint8_t utf8proc_uint8_t
 
typedef int16_t utf8proc_int16_t
 
typedef uint16_t utf8proc_uint16_t
 
typedef int32_t utf8proc_int32_t
 
typedef uint32_t utf8proc_uint32_t
 
typedef size_t utf8proc_size_t
 
typedef ptrdiff_t utf8proc_ssize_t
 
typedef bool utf8proc_bool
 
typedef utf8proc_int16_t utf8proc_propval_t
 
typedef struct utf8proc_property_struct utf8proc_property_t
 
typedef utf8proc_int32_t(* utf8proc_custom_func) (utf8proc_int32_t codepoint, void *data)
 

Enumerations

enum  utf8proc_option_t {
  UTF8PROC_NULLTERM = (1<<0) ,
  UTF8PROC_STABLE = (1<<1) ,
  UTF8PROC_COMPAT = (1<<2) ,
  UTF8PROC_COMPOSE = (1<<3) ,
  UTF8PROC_DECOMPOSE = (1<<4) ,
  UTF8PROC_IGNORE = (1<<5) ,
  UTF8PROC_REJECTNA = (1<<6) ,
  UTF8PROC_NLF2LS = (1<<7) ,
  UTF8PROC_NLF2PS = (1<<8) ,
  UTF8PROC_NLF2LF = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS) ,
  UTF8PROC_STRIPCC = (1<<9) ,
  UTF8PROC_CASEFOLD = (1<<10) ,
  UTF8PROC_CHARBOUND = (1<<11) ,
  UTF8PROC_LUMP = (1<<12) ,
  UTF8PROC_STRIPMARK = (1<<13) ,
  UTF8PROC_STRIPNA = (1<<14)
}
 
enum  utf8proc_category_t {
  UTF8PROC_CATEGORY_CN = 0 ,
  UTF8PROC_CATEGORY_LU = 1 ,
  UTF8PROC_CATEGORY_LL = 2 ,
  UTF8PROC_CATEGORY_LT = 3 ,
  UTF8PROC_CATEGORY_LM = 4 ,
  UTF8PROC_CATEGORY_LO = 5 ,
  UTF8PROC_CATEGORY_MN = 6 ,
  UTF8PROC_CATEGORY_MC = 7 ,
  UTF8PROC_CATEGORY_ME = 8 ,
  UTF8PROC_CATEGORY_ND = 9 ,
  UTF8PROC_CATEGORY_NL = 10 ,
  UTF8PROC_CATEGORY_NO = 11 ,
  UTF8PROC_CATEGORY_PC = 12 ,
  UTF8PROC_CATEGORY_PD = 13 ,
  UTF8PROC_CATEGORY_PS = 14 ,
  UTF8PROC_CATEGORY_PE = 15 ,
  UTF8PROC_CATEGORY_PI = 16 ,
  UTF8PROC_CATEGORY_PF = 17 ,
  UTF8PROC_CATEGORY_PO = 18 ,
  UTF8PROC_CATEGORY_SM = 19 ,
  UTF8PROC_CATEGORY_SC = 20 ,
  UTF8PROC_CATEGORY_SK = 21 ,
  UTF8PROC_CATEGORY_SO = 22 ,
  UTF8PROC_CATEGORY_ZS = 23 ,
  UTF8PROC_CATEGORY_ZL = 24 ,
  UTF8PROC_CATEGORY_ZP = 25 ,
  UTF8PROC_CATEGORY_CC = 26 ,
  UTF8PROC_CATEGORY_CF = 27 ,
  UTF8PROC_CATEGORY_CS = 28 ,
  UTF8PROC_CATEGORY_CO = 29
}
 
enum  utf8proc_bidi_class_t {
  UTF8PROC_BIDI_CLASS_L = 1 ,
  UTF8PROC_BIDI_CLASS_LRE = 2 ,
  UTF8PROC_BIDI_CLASS_LRO = 3 ,
  UTF8PROC_BIDI_CLASS_R = 4 ,
  UTF8PROC_BIDI_CLASS_AL = 5 ,
  UTF8PROC_BIDI_CLASS_RLE = 6 ,
  UTF8PROC_BIDI_CLASS_RLO = 7 ,
  UTF8PROC_BIDI_CLASS_PDF = 8 ,
  UTF8PROC_BIDI_CLASS_EN = 9 ,
  UTF8PROC_BIDI_CLASS_ES = 10 ,
  UTF8PROC_BIDI_CLASS_ET = 11 ,
  UTF8PROC_BIDI_CLASS_AN = 12 ,
  UTF8PROC_BIDI_CLASS_CS = 13 ,
  UTF8PROC_BIDI_CLASS_NSM = 14 ,
  UTF8PROC_BIDI_CLASS_BN = 15 ,
  UTF8PROC_BIDI_CLASS_B = 16 ,
  UTF8PROC_BIDI_CLASS_S = 17 ,
  UTF8PROC_BIDI_CLASS_WS = 18 ,
  UTF8PROC_BIDI_CLASS_ON = 19 ,
  UTF8PROC_BIDI_CLASS_LRI = 20 ,
  UTF8PROC_BIDI_CLASS_RLI = 21 ,
  UTF8PROC_BIDI_CLASS_FSI = 22 ,
  UTF8PROC_BIDI_CLASS_PDI = 23
}
 
enum  utf8proc_decomp_type_t {
  UTF8PROC_DECOMP_TYPE_FONT = 1 ,
  UTF8PROC_DECOMP_TYPE_NOBREAK = 2 ,
  UTF8PROC_DECOMP_TYPE_INITIAL = 3 ,
  UTF8PROC_DECOMP_TYPE_MEDIAL = 4 ,
  UTF8PROC_DECOMP_TYPE_FINAL = 5 ,
  UTF8PROC_DECOMP_TYPE_ISOLATED = 6 ,
  UTF8PROC_DECOMP_TYPE_CIRCLE = 7 ,
  UTF8PROC_DECOMP_TYPE_SUPER = 8 ,
  UTF8PROC_DECOMP_TYPE_SUB = 9 ,
  UTF8PROC_DECOMP_TYPE_VERTICAL = 10 ,
  UTF8PROC_DECOMP_TYPE_WIDE = 11 ,
  UTF8PROC_DECOMP_TYPE_NARROW = 12 ,
  UTF8PROC_DECOMP_TYPE_SMALL = 13 ,
  UTF8PROC_DECOMP_TYPE_SQUARE = 14 ,
  UTF8PROC_DECOMP_TYPE_FRACTION = 15 ,
  UTF8PROC_DECOMP_TYPE_COMPAT = 16
}
 
enum  utf8proc_boundclass_t {
  UTF8PROC_BOUNDCLASS_START = 0 ,
  UTF8PROC_BOUNDCLASS_OTHER = 1 ,
  UTF8PROC_BOUNDCLASS_CR = 2 ,
  UTF8PROC_BOUNDCLASS_LF = 3 ,
  UTF8PROC_BOUNDCLASS_CONTROL = 4 ,
  UTF8PROC_BOUNDCLASS_EXTEND = 5 ,
  UTF8PROC_BOUNDCLASS_L = 6 ,
  UTF8PROC_BOUNDCLASS_V = 7 ,
  UTF8PROC_BOUNDCLASS_T = 8 ,
  UTF8PROC_BOUNDCLASS_LV = 9 ,
  UTF8PROC_BOUNDCLASS_LVT = 10 ,
  UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11 ,
  UTF8PROC_BOUNDCLASS_SPACINGMARK = 12 ,
  UTF8PROC_BOUNDCLASS_PREPEND = 13 ,
  UTF8PROC_BOUNDCLASS_ZWJ = 14 ,
  UTF8PROC_BOUNDCLASS_E_BASE = 15 ,
  UTF8PROC_BOUNDCLASS_E_MODIFIER = 16 ,
  UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17 ,
  UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18 ,
  UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC = 19 ,
  UTF8PROC_BOUNDCLASS_E_ZWG = 20
}
 
enum  utf8proc_indic_conjunct_break_t {
  UTF8PROC_INDIC_CONJUNCT_BREAK_NONE = 0 ,
  UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER = 1 ,
  UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT = 2 ,
  UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND = 3
}
 

Functions

const char * utf8proc_version (void)
 
const char * utf8proc_unicode_version (void)
 
const char * utf8proc_errmsg (utf8proc_ssize_t errcode)
 
utf8proc_ssize_t utf8proc_iterate (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref)
 
utf8proc_bool utf8proc_codepoint_valid (utf8proc_int32_t codepoint)
 
utf8proc_ssize_t utf8proc_encode_char (utf8proc_int32_t codepoint, utf8proc_uint8_t *dst)
 
const utf8proc_property_t * utf8proc_get_property (utf8proc_int32_t codepoint)
 
utf8proc_ssize_t utf8proc_decompose_char (utf8proc_int32_t codepoint, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass)
 
utf8proc_ssize_t utf8proc_decompose (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options)
 
utf8proc_ssize_t utf8proc_decompose_custom (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, utf8proc_custom_func custom_func, void *custom_data)
 
utf8proc_ssize_t utf8proc_normalize_utf32 (utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options)
 
utf8proc_ssize_t utf8proc_reencode (utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options)
 
utf8proc_bool utf8proc_grapheme_break_stateful (utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state)
 
utf8proc_bool utf8proc_grapheme_break (utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2)
 
utf8proc_int32_t utf8proc_tolower (utf8proc_int32_t c)
 
utf8proc_int32_t utf8proc_toupper (utf8proc_int32_t c)
 
utf8proc_int32_t utf8proc_totitle (utf8proc_int32_t c)
 
int utf8proc_islower (utf8proc_int32_t c)
 
int utf8proc_isupper (utf8proc_int32_t c)
 
int utf8proc_charwidth (utf8proc_int32_t codepoint)
 
utf8proc_bool utf8proc_charwidth_ambiguous (utf8proc_int32_t codepoint)
 
utf8proc_category_t utf8proc_category (utf8proc_int32_t codepoint)
 
const char * utf8proc_category_string (utf8proc_int32_t codepoint)
 
utf8proc_ssize_t utf8proc_map (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options)
 
utf8proc_ssize_t utf8proc_map_custom (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options, utf8proc_custom_func custom_func, void *custom_data)
 
Unicode normalization

Returns a pointer to newly allocated memory of a NFD, NFC, NFKD, NFKC or NFKC_Casefold normalized version of the null-terminated string str. These are shortcuts to calling utf8proc_map() with UTF8PROC_NULLTERM combined with UTF8PROC_STABLE and flags indicating the normalization.

utf8proc_uint8_t * utf8proc_NFD (const utf8proc_uint8_t *str)
 
utf8proc_uint8_t * utf8proc_NFC (const utf8proc_uint8_t *str)
 
utf8proc_uint8_t * utf8proc_NFKD (const utf8proc_uint8_t *str)
 
utf8proc_uint8_t * utf8proc_NFKC (const utf8proc_uint8_t *str)
 
utf8proc_uint8_t * utf8proc_NFKC_Casefold (const utf8proc_uint8_t *str)
 

Variables

const utf8proc_int8_t utf8proc_utf8class [256]
 

Macro Definition Documentation

◆ UTF8PROC_ERROR_INVALIDOPTS

#define UTF8PROC_ERROR_INVALIDOPTS   -5

Invalid options have been used.

◆ UTF8PROC_ERROR_INVALIDUTF8

#define UTF8PROC_ERROR_INVALIDUTF8   -3

The given string is not a legal UTF-8 string.

◆ UTF8PROC_ERROR_NOMEM

#define UTF8PROC_ERROR_NOMEM   -1

Memory could not be allocated.

◆ UTF8PROC_ERROR_NOTASSIGNED

#define UTF8PROC_ERROR_NOTASSIGNED   -4

The UTF8PROC_REJECTNA flag was set and an unassigned codepoint was found.

◆ UTF8PROC_ERROR_OVERFLOW

#define UTF8PROC_ERROR_OVERFLOW   -2

The given string is too long to be processed.

◆ UTF8PROC_VERSION_MAJOR

#define UTF8PROC_VERSION_MAJOR   2

The MAJOR version number (increased when backwards API compatibility is broken).

◆ UTF8PROC_VERSION_MINOR

#define UTF8PROC_VERSION_MINOR   10

The MINOR version number (increased when new functionality is added in a backwards-compatible manner).

◆ UTF8PROC_VERSION_PATCH

#define UTF8PROC_VERSION_PATCH   0

The PATCH version (increased for fixes that do not change the API).

Typedef Documentation

◆ utf8proc_custom_func

typedef utf8proc_int32_t(* utf8proc_custom_func) (utf8proc_int32_t codepoint, void *data)

Function pointer type passed to utf8proc_map_custom() and utf8proc_decompose_custom(), which is used to specify a user-defined mapping of codepoints to be applied in conjunction with other mappings.

◆ utf8proc_property_t

Struct containing information about a codepoint.

◆ utf8proc_propval_t

typedef utf8proc_int16_t utf8proc_propval_t

Holds the value of a property.

Enumeration Type Documentation

◆ utf8proc_bidi_class_t

Bidirectional character classes.

Enumerator
UTF8PROC_BIDI_CLASS_L 

Left-to-Right

UTF8PROC_BIDI_CLASS_LRE 

Left-to-Right Embedding

UTF8PROC_BIDI_CLASS_LRO 

Left-to-Right Override

UTF8PROC_BIDI_CLASS_R 

Right-to-Left

UTF8PROC_BIDI_CLASS_AL 

Right-to-Left Arabic

UTF8PROC_BIDI_CLASS_RLE 

Right-to-Left Embedding

UTF8PROC_BIDI_CLASS_RLO 

Right-to-Left Override

UTF8PROC_BIDI_CLASS_PDF 

Pop Directional Format

UTF8PROC_BIDI_CLASS_EN 

European Number

UTF8PROC_BIDI_CLASS_ES 

European Separator

UTF8PROC_BIDI_CLASS_ET 

European Number Terminator

UTF8PROC_BIDI_CLASS_AN 

Arabic Number

UTF8PROC_BIDI_CLASS_CS 

Common Number Separator

UTF8PROC_BIDI_CLASS_NSM 

Nonspacing Mark

UTF8PROC_BIDI_CLASS_BN 

Boundary Neutral

UTF8PROC_BIDI_CLASS_B 

Paragraph Separator

UTF8PROC_BIDI_CLASS_S 

Segment Separator

UTF8PROC_BIDI_CLASS_WS 

Whitespace

UTF8PROC_BIDI_CLASS_ON 

Other Neutrals

UTF8PROC_BIDI_CLASS_LRI 

Left-to-Right Isolate

UTF8PROC_BIDI_CLASS_RLI 

Right-to-Left Isolate

UTF8PROC_BIDI_CLASS_FSI 

First Strong Isolate

UTF8PROC_BIDI_CLASS_PDI 

Pop Directional Isolate

◆ utf8proc_boundclass_t

Boundclass property. (TR29)

Enumerator
UTF8PROC_BOUNDCLASS_START 

Start

UTF8PROC_BOUNDCLASS_OTHER 

Other

UTF8PROC_BOUNDCLASS_CR 

Cr

UTF8PROC_BOUNDCLASS_LF 

Lf

UTF8PROC_BOUNDCLASS_CONTROL 

Control

UTF8PROC_BOUNDCLASS_EXTEND 

Extend

UTF8PROC_BOUNDCLASS_L 

L

UTF8PROC_BOUNDCLASS_V 

V

UTF8PROC_BOUNDCLASS_T 

T

UTF8PROC_BOUNDCLASS_LV 

Lv

UTF8PROC_BOUNDCLASS_LVT 

Lvt

UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR 

Regional indicator

UTF8PROC_BOUNDCLASS_SPACINGMARK 

Spacingmark

UTF8PROC_BOUNDCLASS_PREPEND 

Prepend

UTF8PROC_BOUNDCLASS_ZWJ 

Zero Width Joiner

UTF8PROC_BOUNDCLASS_E_BASE 

Emoji Base

UTF8PROC_BOUNDCLASS_E_MODIFIER 

Emoji Modifier

UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ 

Glue_After_ZWJ

UTF8PROC_BOUNDCLASS_E_BASE_GAZ 

E_BASE + GLUE_AFTER_ZWJ

◆ utf8proc_category_t

Unicode categories.

Enumerator
UTF8PROC_CATEGORY_CN 

Other, not assigned

UTF8PROC_CATEGORY_LU 

Letter, uppercase

UTF8PROC_CATEGORY_LL 

Letter, lowercase

UTF8PROC_CATEGORY_LT 

Letter, titlecase

UTF8PROC_CATEGORY_LM 

Letter, modifier

UTF8PROC_CATEGORY_LO 

Letter, other

UTF8PROC_CATEGORY_MN 

Mark, nonspacing

UTF8PROC_CATEGORY_MC 

Mark, spacing combining

UTF8PROC_CATEGORY_ME 

Mark, enclosing

UTF8PROC_CATEGORY_ND 

Number, decimal digit

UTF8PROC_CATEGORY_NL 

Number, letter

UTF8PROC_CATEGORY_NO 

Number, other

UTF8PROC_CATEGORY_PC 

Punctuation, connector

UTF8PROC_CATEGORY_PD 

Punctuation, dash

UTF8PROC_CATEGORY_PS 

Punctuation, open

UTF8PROC_CATEGORY_PE 

Punctuation, close

UTF8PROC_CATEGORY_PI 

Punctuation, initial quote

UTF8PROC_CATEGORY_PF 

Punctuation, final quote

UTF8PROC_CATEGORY_PO 

Punctuation, other

UTF8PROC_CATEGORY_SM 

Symbol, math

UTF8PROC_CATEGORY_SC 

Symbol, currency

UTF8PROC_CATEGORY_SK 

Symbol, modifier

UTF8PROC_CATEGORY_SO 

Symbol, other

UTF8PROC_CATEGORY_ZS 

Separator, space

UTF8PROC_CATEGORY_ZL 

Separator, line

UTF8PROC_CATEGORY_ZP 

Separator, paragraph

UTF8PROC_CATEGORY_CC 

Other, control

UTF8PROC_CATEGORY_CF 

Other, format

UTF8PROC_CATEGORY_CS 

Other, surrogate

UTF8PROC_CATEGORY_CO 

Other, private use

◆ utf8proc_decomp_type_t

Decomposition type.

Enumerator
UTF8PROC_DECOMP_TYPE_FONT 

Font

UTF8PROC_DECOMP_TYPE_NOBREAK 

Nobreak

UTF8PROC_DECOMP_TYPE_INITIAL 

Initial

UTF8PROC_DECOMP_TYPE_MEDIAL 

Medial

UTF8PROC_DECOMP_TYPE_FINAL 

Final

UTF8PROC_DECOMP_TYPE_ISOLATED 

Isolated

UTF8PROC_DECOMP_TYPE_CIRCLE 

Circle

UTF8PROC_DECOMP_TYPE_SUPER 

Super

UTF8PROC_DECOMP_TYPE_SUB 

Sub

UTF8PROC_DECOMP_TYPE_VERTICAL 

Vertical

UTF8PROC_DECOMP_TYPE_WIDE 

Wide

UTF8PROC_DECOMP_TYPE_NARROW 

Narrow

UTF8PROC_DECOMP_TYPE_SMALL 

Small

UTF8PROC_DECOMP_TYPE_SQUARE 

Square

UTF8PROC_DECOMP_TYPE_FRACTION 

Fraction

UTF8PROC_DECOMP_TYPE_COMPAT 

Compat

◆ utf8proc_indic_conjunct_break_t

Indic_Conjunct_Break property. (TR44)

◆ utf8proc_option_t

Option flags used by several functions in the library.

Enumerator
UTF8PROC_NULLTERM 

The given UTF-8 input is NULL terminated.

UTF8PROC_STABLE 

Unicode Versioning Stability has to be respected.

UTF8PROC_COMPAT 

Compatibility decomposition (i.e. formatting information is lost).

UTF8PROC_COMPOSE 

Return a result with composed characters.

UTF8PROC_DECOMPOSE 

Return a result with decomposed characters.

UTF8PROC_IGNORE 

Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE.

UTF8PROC_REJECTNA 

Return an error, if the input contains unassigned codepoints.

UTF8PROC_NLF2LS 

Indicating that NLF-sequences (LF, CRLF, CR, NEL) are representing a line break, and should be converted to the codepoint for line separation (LS).

UTF8PROC_NLF2PS 

Indicating that NLF-sequences are representing a paragraph break, and should be converted to the codepoint for paragraph separation (PS).

UTF8PROC_NLF2LF 

Indicating that the meaning of NLF-sequences is unknown.

UTF8PROC_STRIPCC 

Strips and/or converts control characters.

NLF-sequences are transformed into space, except if one of the NLF2LS/PS/LF options is given. HorizontalTab (HT) and FormFeed (FF) are treated as a NLF-sequence in this case. All other control characters are simply removed.

UTF8PROC_CASEFOLD 

Performs unicode case folding, to be able to do a case-insensitive string comparison.

UTF8PROC_CHARBOUND 

Inserts 0xFF bytes at the beginning of each sequence which is representing a single grapheme cluster (see UAX#29).

UTF8PROC_LUMP 

Lumps certain characters together.

E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-". See lump.md for details.

If NLF2LF is set, this includes a transformation of paragraph and line separators to ASCII line-feed (LF).

UTF8PROC_STRIPMARK 

Strips all character markings.

This includes non-spacing, spacing and enclosing (i.e. accents).

Note
This option works only with UTF8PROC_COMPOSE or UTF8PROC_DECOMPOSE
UTF8PROC_STRIPNA 

Strip unassigned codepoints.

Function Documentation

◆ utf8proc_category()

utf8proc_category_t utf8proc_category ( utf8proc_int32_t codepoint)

Return the Unicode category for the codepoint (one of the utf8proc_category_t constants).

◆ utf8proc_category_string()

const char * utf8proc_category_string ( utf8proc_int32_t codepoint)

Return the two-letter (nul-terminated) Unicode category string for the codepoint (e.g. "Lu" or "Co").

◆ utf8proc_charwidth()

int utf8proc_charwidth ( utf8proc_int32_t codepoint)

Given a codepoint, return a character width analogous to wcwidth(codepoint), except that a width of 0 is returned for non-printable codepoints instead of -1 as in wcwidth.

Note
If you want to check for particular types of non-printable characters, (analogous to isprint or iscntrl), use utf8proc_category().

◆ utf8proc_charwidth_ambiguous()

utf8proc_bool utf8proc_charwidth_ambiguous ( utf8proc_int32_t codepoint)

Given a codepoint, return whether it has East Asian width class A (Ambiguous).

Codepoints with this property are considered to have charwidth 1 (if they are printable) but some East Asian fonts render them as double width.

◆ utf8proc_codepoint_valid()

utf8proc_bool utf8proc_codepoint_valid ( utf8proc_int32_t codepoint)

Check if a codepoint is valid (regardless of whether it has been assigned a value by the current Unicode standard).

Returns
1 if the given codepoint is valid and otherwise return 0.

◆ utf8proc_decompose()

utf8proc_ssize_t utf8proc_decompose ( const utf8proc_uint8_t * str,
utf8proc_ssize_t strlen,
utf8proc_int32_t * buffer,
utf8proc_ssize_t bufsize,
utf8proc_option_t options )

The same as utf8proc_decompose_char(), but acts on a whole UTF-8 string and orders the decomposed sequences correctly.

If the UTF8PROC_NULLTERM flag in options is set, processing will be stopped, when a NULL byte is encountered, otherwise strlen bytes are processed. The result (in the form of 32-bit unicode codepoints) is written into the buffer being pointed to by buffer (which must contain at least bufsize entries). In case of success, the number of codepoints written is returned; in case of an error, a negative error code is returned (utf8proc_errmsg()). See utf8proc_decompose_custom() to supply additional transformations.

If the number of written codepoints would be bigger than bufsize, the required buffer size is returned, while the buffer will be overwritten with undefined data.

◆ utf8proc_decompose_char()

utf8proc_ssize_t utf8proc_decompose_char ( utf8proc_int32_t codepoint,
utf8proc_int32_t * dst,
utf8proc_ssize_t bufsize,
utf8proc_option_t options,
int * last_boundclass )

Decompose a codepoint into an array of codepoints.

Parameters
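codepoint: the codepoint.
dst: the destination buffer.
bufsize: the size of the destination buffer.
options: one or more of the following flags:
  - UTF8PROC_REJECTNA - return an error if codepoint is unassigned
  - UTF8PROC_IGNORE - strip "default ignorable" codepoints
  - UTF8PROC_CASEFOLD - apply Unicode casefolding
  - UTF8PROC_COMPAT - replace certain codepoints with their compatibility decomposition
  - UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
  - UTF8PROC_LUMP - lump certain different codepoints together
  - UTF8PROC_STRIPMARK - remove all character marks
  - UTF8PROC_STRIPNA - remove unassigned codepoints
last_boundclass: Pointer to an integer variable containing the previous codepoint's (boundclass + indic_conjunct_break << 1) if the UTF8PROC_CHARBOUND option is used. If the string is being processed in order, this can be initialized to 0 for the beginning of the string, and is thereafter updated automatically. Otherwise, this parameter is ignored.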
Returns
In case of success, the number of codepoints written is returned; in case of an error, a negative error code is returned (utf8proc_errmsg()).
If the number of written codepoints would be bigger than bufsize, the required buffer size is returned, while the buffer will be overwritten with undefined data.

◆ utf8proc_decompose_custom()

utf8proc_ssize_t utf8proc_decompose_custom ( const utf8proc_uint8_t * str,
utf8proc_ssize_t strlen,
utf8proc_int32_t * buffer,
utf8proc_ssize_t bufsize,
utf8proc_option_t options,
utf8proc_custom_func custom_func,
void * custom_data )

The same as utf8proc_decompose(), but also takes a custom_func mapping function that is called on each codepoint in str before any other transformations (along with a custom_data pointer that is passed through to custom_func). The custom_func argument is ignored if it is NULL. See also utf8proc_map_custom().

◆ utf8proc_encode_char()

utf8proc_ssize_t utf8proc_encode_char ( utf8proc_int32_t codepoint,
utf8proc_uint8_t * dst )

Encodes the codepoint as a UTF-8 string in the byte array pointed to by dst. This array must be at least 4 bytes long.

In case of success the number of bytes written is returned, and otherwise 0 is returned.

This function does not check whether codepoint is valid Unicode.

◆ utf8proc_errmsg()

const char * utf8proc_errmsg ( utf8proc_ssize_t errcode)

Returns an informative error string for the given utf8proc error code (e.g. the error codes returned by utf8proc_map()).

◆ utf8proc_get_property()

const utf8proc_property_t * utf8proc_get_property ( utf8proc_int32_t codepoint)

Look up the properties for a given codepoint.

Parameters
codepoint: The Unicode codepoint.
Returns
A pointer to a (constant) struct containing information about the codepoint.
If the codepoint is unassigned or invalid, a pointer to a special struct is returned in which category is 0 (UTF8PROC_CATEGORY_CN).

◆ utf8proc_grapheme_break()

utf8proc_bool utf8proc_grapheme_break ( utf8proc_int32_t codepoint1,
utf8proc_int32_t codepoint2 )

Same as utf8proc_grapheme_break_stateful(), except without support for the Unicode 9 additions to the algorithm. Supported for legacy reasons.

◆ utf8proc_grapheme_break_stateful()

utf8proc_bool utf8proc_grapheme_break_stateful ( utf8proc_int32_t codepoint1,
utf8proc_int32_t codepoint2,
utf8proc_int32_t * state )

Given a pair of consecutive codepoints, return whether a grapheme break is permitted between them (as defined by the extended grapheme clusters in UAX#29).

Parameters
codepoint1: The first codepoint.
codepoint2: The second codepoint, occurring consecutively after codepoint1.
state: Beginning with Version 29 (Unicode 9.0.0), this algorithm requires state to break graphemes. This state can be passed in as a pointer in the state argument and should initially be set to 0. If the state is not passed in (i.e. a null pointer is passed), UAX#29 rules GB10/12/13 which require this state will not be applied, essentially matching the rules in Unicode 8.0.0.
Warning
If the state parameter is used, utf8proc_grapheme_break_stateful must be called IN ORDER on ALL potential breaks in a string. However, it is safe to reset the state to zero after a grapheme break.

◆ utf8proc_islower()

int utf8proc_islower ( utf8proc_int32_t c)

Given a codepoint c, return 1 if the codepoint corresponds to a lower-case character and 0 otherwise.

◆ utf8proc_isupper()

int utf8proc_isupper ( utf8proc_int32_t c)

Given a codepoint c, return 1 if the codepoint corresponds to an upper-case character and 0 otherwise.

◆ utf8proc_iterate()

utf8proc_ssize_t utf8proc_iterate ( const utf8proc_uint8_t * str,
utf8proc_ssize_t strlen,
utf8proc_int32_t * codepoint_ref )

Reads a single codepoint from the UTF-8 sequence being pointed to by str. The maximum number of bytes read is strlen, unless strlen is negative (in which case up to 4 bytes are read).

If a valid codepoint could be read, it is stored in the variable pointed to by codepoint_ref, otherwise that variable will be set to -1. In case of success, the number of bytes read is returned; otherwise, a negative error code is returned.

◆ utf8proc_map()

utf8proc_ssize_t utf8proc_map ( const utf8proc_uint8_t * str,
utf8proc_ssize_t strlen,
utf8proc_uint8_t ** dstptr,
utf8proc_option_t options )

Maps the given UTF-8 string pointed to by str to a new UTF-8 string, allocated dynamically by malloc and returned via dstptr.

If the UTF8PROC_NULLTERM flag in the options field is set, the length is determined by a NULL terminator, otherwise the parameter strlen is evaluated to determine the string length, but in any case the result will be NULL terminated (though it might contain NULL characters within the string if str contained NULL characters). Other flags in the options field are passed to the functions defined above, and regarded as described. See also utf8proc_map_custom() to supply a custom codepoint transformation.

In case of success the length of the new string is returned, otherwise a negative error code is returned.

Note
The memory of the new UTF-8 string will have been allocated with malloc, and should therefore be deallocated with free.

◆ utf8proc_map_custom()

utf8proc_ssize_t utf8proc_map_custom ( const utf8proc_uint8_t * str,
utf8proc_ssize_t strlen,
utf8proc_uint8_t ** dstptr,
utf8proc_option_t options,
utf8proc_custom_func custom_func,
void * custom_data )

Like utf8proc_map(), but also takes a custom_func mapping function that is called on each codepoint in str before any other transformations (along with a custom_data pointer that is passed through to custom_func). The custom_func argument is ignored if it is NULL.

◆ utf8proc_NFC()

utf8proc_uint8_t * utf8proc_NFC ( const utf8proc_uint8_t * str)

NFC normalization (UTF8PROC_COMPOSE).

◆ utf8proc_NFD()

utf8proc_uint8_t * utf8proc_NFD ( const utf8proc_uint8_t * str)

NFD normalization (UTF8PROC_DECOMPOSE).

◆ utf8proc_NFKC()

utf8proc_uint8_t * utf8proc_NFKC ( const utf8proc_uint8_t * str)

NFKC normalization (UTF8PROC_COMPOSE and UTF8PROC_COMPAT).

◆ utf8proc_NFKC_Casefold()

utf8proc_uint8_t * utf8proc_NFKC_Casefold ( const utf8proc_uint8_t * str)

NFKC_Casefold normalization (UTF8PROC_COMPOSE and UTF8PROC_COMPAT and UTF8PROC_CASEFOLD and UTF8PROC_IGNORE).

◆ utf8proc_NFKD()

utf8proc_uint8_t * utf8proc_NFKD ( const utf8proc_uint8_t * str)

NFKD normalization (UTF8PROC_DECOMPOSE and UTF8PROC_COMPAT).

◆ utf8proc_normalize_utf32()

utf8proc_ssize_t utf8proc_normalize_utf32 ( utf8proc_int32_t * buffer,
utf8proc_ssize_t length,
utf8proc_option_t options )

Normalizes the sequence of length codepoints pointed to by buffer in-place (i.e., the result is also stored in buffer).

Parameters
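buffer: the (native-endian UTF-32) unicode codepoints to normalize.
length: the length (in codepoints) of the buffer.
options: a bitwise or (|) of one or more of the following flags:
  - UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS
  - UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS
  - UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF
  - UTF8PROC_STRIPCC - strip or convert all non-affected control characters
  - UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite codepoints
  - UTF8PROC_STABLE - prohibit combining characters that would violate the unicode versioning stability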
Returns
In case of success, the length (in codepoints) of the normalized UTF-32 string is returned; otherwise, a negative error code is returned (utf8proc_errmsg()).
Warning
The entries of the array pointed to by buffer have to be in the range 0x0000 to 0x10FFFF. Otherwise, the program might crash!

◆ utf8proc_reencode()

utf8proc_ssize_t utf8proc_reencode ( utf8proc_int32_t * buffer,
utf8proc_ssize_t length,
utf8proc_option_t options )

Reencodes the sequence of length codepoints pointed to by buffer as UTF-8 data in-place (i.e., the result is also stored in buffer). Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.

Parameters
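buffer: the (native-endian UTF-32) unicode codepoints to re-encode.
length: the length (in codepoints) of the buffer.
options: a bitwise or (|) of one or more of the following flags:
  - UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS
  - UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS
  - UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF
  - UTF8PROC_STRIPCC - strip or convert all non-affected control characters
  - UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite codepoints
  - UTF8PROC_STABLE - prohibit combining characters that would violate the unicode versioning stability
  - UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster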
Returns
In case of success, the length (in bytes) of the resulting nul-terminated UTF-8 string is returned; otherwise, a negative error code is returned (utf8proc_errmsg()).
Warning
The amount of free space pointed to by buffer must exceed the amount of the input data by one byte, and the entries of the array pointed to by buffer have to be in the range 0x0000 to 0x10FFFF. Otherwise, the program might crash!

◆ utf8proc_tolower()

utf8proc_int32_t utf8proc_tolower ( utf8proc_int32_t c)

Given a codepoint c, return the codepoint of the corresponding lower-case character, if any; otherwise (if there is no lower-case variant, or if c is not a valid codepoint) return c.

◆ utf8proc_totitle()

utf8proc_int32_t utf8proc_totitle ( utf8proc_int32_t c)

Given a codepoint c, return the codepoint of the corresponding title-case character, if any; otherwise (if there is no title-case variant, or if c is not a valid codepoint) return c.

◆ utf8proc_toupper()

utf8proc_int32_t utf8proc_toupper ( utf8proc_int32_t c)

Given a codepoint c, return the codepoint of the corresponding upper-case character, if any; otherwise (if there is no upper-case variant, or if c is not a valid codepoint) return c.

◆ utf8proc_unicode_version()

const char * utf8proc_unicode_version ( void )

Returns the utf8proc supported Unicode version as a string MAJOR.MINOR.PATCH.

◆ utf8proc_version()

const char * utf8proc_version ( void )

Returns the utf8proc API version as a string MAJOR.MINOR.PATCH (http://semver.org format), possibly with a "-dev" suffix for development versions.

Variable Documentation

◆ utf8proc_utf8class

const utf8proc_int8_t utf8proc_utf8class[256]
extern

Array containing the byte lengths of a UTF-8 encoded codepoint based on the first byte.