#include <stdlib.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
#include <limits.h>

Data Structures
struct	utf8proc_property_struct

Macros
#define	UTF8PROC_DLLEXPORT
API version
The utf8proc API version MAJOR.MINOR.PATCH, following semantic-versioning rules (http://semver.org) based on API compatibility. This is also returned at runtime by utf8proc_version(); however, the runtime version may append a string like "-dev" to the version number for prerelease versions. Note: The shared-library version number in the Makefile (and CMakeLists.txt, and MANIFEST) may be different, being based on ABI compatibility rather than API compatibility.
#define	UTF8PROC_VERSION_MAJOR 2
#define	UTF8PROC_VERSION_MINOR 11
#define	UTF8PROC_VERSION_PATCH 2
Error codes
Error codes being returned by almost all functions.
#define	UTF8PROC_ERROR_NOMEM -1
#define	UTF8PROC_ERROR_OVERFLOW -2
#define	UTF8PROC_ERROR_INVALIDUTF8 -3
#define	UTF8PROC_ERROR_NOTASSIGNED -4
#define	UTF8PROC_ERROR_INVALIDOPTS -5

Typedefs
typedef int8_t	utf8proc_int8_t
typedef uint8_t	utf8proc_uint8_t
typedef int16_t	utf8proc_int16_t
typedef uint16_t	utf8proc_uint16_t
typedef int32_t	utf8proc_int32_t
typedef uint32_t	utf8proc_uint32_t
typedef size_t	utf8proc_size_t
typedef ptrdiff_t	utf8proc_ssize_t
typedef bool	utf8proc_bool
typedef utf8proc_int16_t	utf8proc_propval_t
typedef struct utf8proc_property_struct	utf8proc_property_t
typedef utf8proc_int32_t(*	utf8proc_custom_func) (utf8proc_int32_t codepoint, void *data)

Enumerations
enum	utf8proc_option_t { UTF8PROC_NULLTERM = (1<<0) , UTF8PROC_STABLE = (1<<1) , UTF8PROC_COMPAT = (1<<2) , UTF8PROC_COMPOSE = (1<<3) , UTF8PROC_DECOMPOSE = (1<<4) , UTF8PROC_IGNORE = (1<<5) , UTF8PROC_REJECTNA = (1<<6) , UTF8PROC_NLF2LS = (1<<7) , UTF8PROC_NLF2PS = (1<<8) , UTF8PROC_NLF2LF = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS) , UTF8PROC_STRIPCC = (1<<9) , UTF8PROC_CASEFOLD = (1<<10) , UTF8PROC_CHARBOUND = (1<<11) , UTF8PROC_LUMP = (1<<12) , UTF8PROC_STRIPMARK = (1<<13) , UTF8PROC_STRIPNA = (1<<14) }
enum	utf8proc_category_t { UTF8PROC_CATEGORY_CN = 0 , UTF8PROC_CATEGORY_LU = 1 , UTF8PROC_CATEGORY_LL = 2 , UTF8PROC_CATEGORY_LT = 3 , UTF8PROC_CATEGORY_LM = 4 , UTF8PROC_CATEGORY_LO = 5 , UTF8PROC_CATEGORY_MN = 6 , UTF8PROC_CATEGORY_MC = 7 , UTF8PROC_CATEGORY_ME = 8 , UTF8PROC_CATEGORY_ND = 9 , UTF8PROC_CATEGORY_NL = 10 , UTF8PROC_CATEGORY_NO = 11 , UTF8PROC_CATEGORY_PC = 12 , UTF8PROC_CATEGORY_PD = 13 , UTF8PROC_CATEGORY_PS = 14 , UTF8PROC_CATEGORY_PE = 15 , UTF8PROC_CATEGORY_PI = 16 , UTF8PROC_CATEGORY_PF = 17 , UTF8PROC_CATEGORY_PO = 18 , UTF8PROC_CATEGORY_SM = 19 , UTF8PROC_CATEGORY_SC = 20 , UTF8PROC_CATEGORY_SK = 21 , UTF8PROC_CATEGORY_SO = 22 , UTF8PROC_CATEGORY_ZS = 23 , UTF8PROC_CATEGORY_ZL = 24 , UTF8PROC_CATEGORY_ZP = 25 , UTF8PROC_CATEGORY_CC = 26 , UTF8PROC_CATEGORY_CF = 27 , UTF8PROC_CATEGORY_CS = 28 , UTF8PROC_CATEGORY_CO = 29 }
enum	utf8proc_bidi_class_t { UTF8PROC_BIDI_CLASS_L = 1 , UTF8PROC_BIDI_CLASS_LRE = 2 , UTF8PROC_BIDI_CLASS_LRO = 3 , UTF8PROC_BIDI_CLASS_R = 4 , UTF8PROC_BIDI_CLASS_AL = 5 , UTF8PROC_BIDI_CLASS_RLE = 6 , UTF8PROC_BIDI_CLASS_RLO = 7 , UTF8PROC_BIDI_CLASS_PDF = 8 , UTF8PROC_BIDI_CLASS_EN = 9 , UTF8PROC_BIDI_CLASS_ES = 10 , UTF8PROC_BIDI_CLASS_ET = 11 , UTF8PROC_BIDI_CLASS_AN = 12 , UTF8PROC_BIDI_CLASS_CS = 13 , UTF8PROC_BIDI_CLASS_NSM = 14 , UTF8PROC_BIDI_CLASS_BN = 15 , UTF8PROC_BIDI_CLASS_B = 16 , UTF8PROC_BIDI_CLASS_S = 17 , UTF8PROC_BIDI_CLASS_WS = 18 , UTF8PROC_BIDI_CLASS_ON = 19 , UTF8PROC_BIDI_CLASS_LRI = 20 , UTF8PROC_BIDI_CLASS_RLI = 21 , UTF8PROC_BIDI_CLASS_FSI = 22 , UTF8PROC_BIDI_CLASS_PDI = 23 }
enum	utf8proc_decomp_type_t { UTF8PROC_DECOMP_TYPE_FONT = 1 , UTF8PROC_DECOMP_TYPE_NOBREAK = 2 , UTF8PROC_DECOMP_TYPE_INITIAL = 3 , UTF8PROC_DECOMP_TYPE_MEDIAL = 4 , UTF8PROC_DECOMP_TYPE_FINAL = 5 , UTF8PROC_DECOMP_TYPE_ISOLATED = 6 , UTF8PROC_DECOMP_TYPE_CIRCLE = 7 , UTF8PROC_DECOMP_TYPE_SUPER = 8 , UTF8PROC_DECOMP_TYPE_SUB = 9 , UTF8PROC_DECOMP_TYPE_VERTICAL = 10 , UTF8PROC_DECOMP_TYPE_WIDE = 11 , UTF8PROC_DECOMP_TYPE_NARROW = 12 , UTF8PROC_DECOMP_TYPE_SMALL = 13 , UTF8PROC_DECOMP_TYPE_SQUARE = 14 , UTF8PROC_DECOMP_TYPE_FRACTION = 15 , UTF8PROC_DECOMP_TYPE_COMPAT = 16 }
enum	utf8proc_boundclass_t { UTF8PROC_BOUNDCLASS_START = 0 , UTF8PROC_BOUNDCLASS_OTHER = 1 , UTF8PROC_BOUNDCLASS_CR = 2 , UTF8PROC_BOUNDCLASS_LF = 3 , UTF8PROC_BOUNDCLASS_CONTROL = 4 , UTF8PROC_BOUNDCLASS_EXTEND = 5 , UTF8PROC_BOUNDCLASS_L = 6 , UTF8PROC_BOUNDCLASS_V = 7 , UTF8PROC_BOUNDCLASS_T = 8 , UTF8PROC_BOUNDCLASS_LV = 9 , UTF8PROC_BOUNDCLASS_LVT = 10 , UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11 , UTF8PROC_BOUNDCLASS_SPACINGMARK = 12 , UTF8PROC_BOUNDCLASS_PREPEND = 13 , UTF8PROC_BOUNDCLASS_ZWJ = 14 , UTF8PROC_BOUNDCLASS_E_BASE = 15 , UTF8PROC_BOUNDCLASS_E_MODIFIER = 16 , UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17 , UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18 , UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC = 19 , UTF8PROC_BOUNDCLASS_E_ZWG = 20 }
enum	utf8proc_indic_conjunct_break_t { UTF8PROC_INDIC_CONJUNCT_BREAK_NONE = 0 , UTF8PROC_INDIC_CONJUNCT_BREAK_LINKER = 1 , UTF8PROC_INDIC_CONJUNCT_BREAK_CONSONANT = 2 , UTF8PROC_INDIC_CONJUNCT_BREAK_EXTEND = 3 }

Functions
const char *	utf8proc_version (void)
const char *	utf8proc_unicode_version (void)
const char *	utf8proc_errmsg (utf8proc_ssize_t errcode)
utf8proc_ssize_t	utf8proc_iterate (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref)
utf8proc_bool	utf8proc_codepoint_valid (utf8proc_int32_t codepoint)
utf8proc_ssize_t	utf8proc_encode_char (utf8proc_int32_t codepoint, utf8proc_uint8_t *dst)
const utf8proc_property_t *	utf8proc_get_property (utf8proc_int32_t codepoint)
utf8proc_ssize_t	utf8proc_decompose_char (utf8proc_int32_t codepoint, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass)
utf8proc_ssize_t	utf8proc_decompose (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options)
utf8proc_ssize_t	utf8proc_decompose_custom (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, utf8proc_custom_func custom_func, void *custom_data)
utf8proc_ssize_t	utf8proc_normalize_utf32 (utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options)
utf8proc_ssize_t	utf8proc_reencode (utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options)
utf8proc_bool	utf8proc_grapheme_break_stateful (utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state)
utf8proc_bool	utf8proc_grapheme_break (utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2)
utf8proc_int32_t	utf8proc_tolower (utf8proc_int32_t c)
utf8proc_int32_t	utf8proc_toupper (utf8proc_int32_t c)
utf8proc_int32_t	utf8proc_totitle (utf8proc_int32_t c)
int	utf8proc_islower (utf8proc_int32_t c)
int	utf8proc_isupper (utf8proc_int32_t c)
int	utf8proc_charwidth (utf8proc_int32_t codepoint)
utf8proc_bool	utf8proc_charwidth_ambiguous (utf8proc_int32_t codepoint)
utf8proc_category_t	utf8proc_category (utf8proc_int32_t codepoint)
const char *	utf8proc_category_string (utf8proc_int32_t codepoint)
utf8proc_ssize_t	utf8proc_map (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options)
utf8proc_ssize_t	utf8proc_map_custom (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options, utf8proc_custom_func custom_func, void *custom_data)
Unicode normalization
Returns a pointer to newly allocated memory of a NFD, NFC, NFKD, NFKC or NFKC_Casefold normalized version of the null-terminated string str. These are shortcuts to calling utf8proc_map() with UTF8PROC_NULLTERM combined with UTF8PROC_STABLE and flags indicating the normalization.
utf8proc_uint8_t *	utf8proc_NFD (const utf8proc_uint8_t *str)
utf8proc_uint8_t *	utf8proc_NFC (const utf8proc_uint8_t *str)
utf8proc_uint8_t *	utf8proc_NFKD (const utf8proc_uint8_t *str)
utf8proc_uint8_t *	utf8proc_NFKC (const utf8proc_uint8_t *str)
utf8proc_uint8_t *	utf8proc_NFKC_Casefold (const utf8proc_uint8_t *str)

Variables
const utf8proc_int8_t	utf8proc_utf8class [256]

Macro Definition Documentation

◆ UTF8PROC_ERROR_INVALIDOPTS

#define UTF8PROC_ERROR_INVALIDOPTS -5

Invalid options have been used.

◆ UTF8PROC_ERROR_INVALIDUTF8

#define UTF8PROC_ERROR_INVALIDUTF8 -3

The given string is not a legal UTF-8 string.

◆ UTF8PROC_ERROR_NOMEM

#define UTF8PROC_ERROR_NOMEM -1

Memory could not be allocated.

◆ UTF8PROC_ERROR_NOTASSIGNED

#define UTF8PROC_ERROR_NOTASSIGNED -4

The UTF8PROC_REJECTNA flag was set and an unassigned codepoint was found.

◆ UTF8PROC_ERROR_OVERFLOW

#define UTF8PROC_ERROR_OVERFLOW -2

The given string is too long to be processed.

◆ UTF8PROC_VERSION_MAJOR

#define UTF8PROC_VERSION_MAJOR 2

The MAJOR version number (increased when backwards API compatibility is broken).

◆ UTF8PROC_VERSION_MINOR

#define UTF8PROC_VERSION_MINOR 11

The MINOR version number (increased when new functionality is added in a backwards-compatible manner).

◆ UTF8PROC_VERSION_PATCH

#define UTF8PROC_VERSION_PATCH 2

The PATCH version (increased for fixes that do not change the API).

Typedef Documentation

◆ utf8proc_custom_func

typedef utf8proc_int32_t(* utf8proc_custom_func) (utf8proc_int32_t codepoint, void *data)

Function pointer type passed to utf8proc_map_custom() and utf8proc_decompose_custom(), which is used to specify a user-defined mapping of codepoints to be applied in conjunction with other mappings.

◆ utf8proc_property_t

typedef struct utf8proc_property_struct utf8proc_property_t

Struct containing information about a codepoint.

◆ utf8proc_propval_t

typedef utf8proc_int16_t utf8proc_propval_t

Holds the value of a property.

Enumeration Type Documentation

◆ utf8proc_bidi_class_t

enum utf8proc_bidi_class_t

Bidirectional character classes.

Enumerator
UTF8PROC_BIDI_CLASS_L	Left-to-Right
UTF8PROC_BIDI_CLASS_LRE	Left-to-Right Embedding
UTF8PROC_BIDI_CLASS_LRO	Left-to-Right Override
UTF8PROC_BIDI_CLASS_R	Right-to-Left
UTF8PROC_BIDI_CLASS_AL	Right-to-Left Arabic
UTF8PROC_BIDI_CLASS_RLE	Right-to-Left Embedding
UTF8PROC_BIDI_CLASS_RLO	Right-to-Left Override
UTF8PROC_BIDI_CLASS_PDF	Pop Directional Format
UTF8PROC_BIDI_CLASS_EN	European Number
UTF8PROC_BIDI_CLASS_ES	European Separator
UTF8PROC_BIDI_CLASS_ET	European Number Terminator
UTF8PROC_BIDI_CLASS_AN	Arabic Number
UTF8PROC_BIDI_CLASS_CS	Common Number Separator
UTF8PROC_BIDI_CLASS_NSM	Nonspacing Mark
UTF8PROC_BIDI_CLASS_BN	Boundary Neutral
UTF8PROC_BIDI_CLASS_B	Paragraph Separator
UTF8PROC_BIDI_CLASS_S	Segment Separator
UTF8PROC_BIDI_CLASS_WS	Whitespace
UTF8PROC_BIDI_CLASS_ON	Other Neutrals
UTF8PROC_BIDI_CLASS_LRI	Left-to-Right Isolate
UTF8PROC_BIDI_CLASS_RLI	Right-to-Left Isolate
UTF8PROC_BIDI_CLASS_FSI	First Strong Isolate
UTF8PROC_BIDI_CLASS_PDI	Pop Directional Isolate

◆ utf8proc_boundclass_t

enum utf8proc_boundclass_t

Boundclass property. (TR29)

Enumerator
UTF8PROC_BOUNDCLASS_START	Start
UTF8PROC_BOUNDCLASS_OTHER	Other
UTF8PROC_BOUNDCLASS_CR	Cr
UTF8PROC_BOUNDCLASS_LF	Lf
UTF8PROC_BOUNDCLASS_CONTROL	Control
UTF8PROC_BOUNDCLASS_EXTEND	Extend
UTF8PROC_BOUNDCLASS_L	L
UTF8PROC_BOUNDCLASS_V	V
UTF8PROC_BOUNDCLASS_T	T
UTF8PROC_BOUNDCLASS_LV	Lv
UTF8PROC_BOUNDCLASS_LVT	Lvt
UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR	Regional indicator
UTF8PROC_BOUNDCLASS_SPACINGMARK	Spacingmark
UTF8PROC_BOUNDCLASS_PREPEND	Prepend
UTF8PROC_BOUNDCLASS_ZWJ	Zero Width Joiner
UTF8PROC_BOUNDCLASS_E_BASE	Emoji Base
UTF8PROC_BOUNDCLASS_E_MODIFIER	Emoji Modifier
UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ	Glue_After_ZWJ
UTF8PROC_BOUNDCLASS_E_BASE_GAZ	E_BASE + GLUE_AFTER_ZWJ

◆ utf8proc_category_t

enum utf8proc_category_t

Unicode categories.

Enumerator
UTF8PROC_CATEGORY_CN	Other, not assigned
UTF8PROC_CATEGORY_LU	Letter, uppercase
UTF8PROC_CATEGORY_LL	Letter, lowercase
UTF8PROC_CATEGORY_LT	Letter, titlecase
UTF8PROC_CATEGORY_LM	Letter, modifier
UTF8PROC_CATEGORY_LO	Letter, other
UTF8PROC_CATEGORY_MN	Mark, nonspacing
UTF8PROC_CATEGORY_MC	Mark, spacing combining
UTF8PROC_CATEGORY_ME	Mark, enclosing
UTF8PROC_CATEGORY_ND	Number, decimal digit
UTF8PROC_CATEGORY_NL	Number, letter
UTF8PROC_CATEGORY_NO	Number, other
UTF8PROC_CATEGORY_PC	Punctuation, connector
UTF8PROC_CATEGORY_PD	Punctuation, dash
UTF8PROC_CATEGORY_PS	Punctuation, open
UTF8PROC_CATEGORY_PE	Punctuation, close
UTF8PROC_CATEGORY_PI	Punctuation, initial quote
UTF8PROC_CATEGORY_PF	Punctuation, final quote
UTF8PROC_CATEGORY_PO	Punctuation, other
UTF8PROC_CATEGORY_SM	Symbol, math
UTF8PROC_CATEGORY_SC	Symbol, currency
UTF8PROC_CATEGORY_SK	Symbol, modifier
UTF8PROC_CATEGORY_SO	Symbol, other
UTF8PROC_CATEGORY_ZS	Separator, space
UTF8PROC_CATEGORY_ZL	Separator, line
UTF8PROC_CATEGORY_ZP	Separator, paragraph
UTF8PROC_CATEGORY_CC	Other, control
UTF8PROC_CATEGORY_CF	Other, format
UTF8PROC_CATEGORY_CS	Other, surrogate
UTF8PROC_CATEGORY_CO	Other, private use

◆ utf8proc_decomp_type_t

enum utf8proc_decomp_type_t

Decomposition type.

Enumerator
UTF8PROC_DECOMP_TYPE_FONT	Font
UTF8PROC_DECOMP_TYPE_NOBREAK	Nobreak
UTF8PROC_DECOMP_TYPE_INITIAL	Initial
UTF8PROC_DECOMP_TYPE_MEDIAL	Medial
UTF8PROC_DECOMP_TYPE_FINAL	Final
UTF8PROC_DECOMP_TYPE_ISOLATED	Isolated
UTF8PROC_DECOMP_TYPE_CIRCLE	Circle
UTF8PROC_DECOMP_TYPE_SUPER	Super
UTF8PROC_DECOMP_TYPE_SUB	Sub
UTF8PROC_DECOMP_TYPE_VERTICAL	Vertical
UTF8PROC_DECOMP_TYPE_WIDE	Wide
UTF8PROC_DECOMP_TYPE_NARROW	Narrow
UTF8PROC_DECOMP_TYPE_SMALL	Small
UTF8PROC_DECOMP_TYPE_SQUARE	Square
UTF8PROC_DECOMP_TYPE_FRACTION	Fraction
UTF8PROC_DECOMP_TYPE_COMPAT	Compat

◆ utf8proc_indic_conjunct_break_t

enum utf8proc_indic_conjunct_break_t

Indic_Conjunct_Break property. (TR44)

◆ utf8proc_option_t

enum utf8proc_option_t

Option flags used by several functions in the library.

Enumerator
UTF8PROC_NULLTERM	The given UTF-8 input is NULL terminated.
UTF8PROC_STABLE	Unicode Versioning Stability has to be respected.
UTF8PROC_COMPAT	Compatibility decomposition (i.e. formatting information is lost).
UTF8PROC_COMPOSE	Return a result with composed characters.
UTF8PROC_DECOMPOSE	Return a result with decomposed characters.
UTF8PROC_IGNORE	Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE.
UTF8PROC_REJECTNA	Return an error, if the input contains unassigned codepoints.
UTF8PROC_NLF2LS	Indicating that NLF-sequences (LF, CRLF, CR, NEL) are representing a line break, and should be converted to the codepoint for line separation (LS).
UTF8PROC_NLF2PS	Indicating that NLF-sequences are representing a paragraph break, and should be converted to the codepoint for paragraph separation (PS).
UTF8PROC_NLF2LF	Indicating that the meaning of NLF-sequences is unknown.
UTF8PROC_STRIPCC	Strips and/or converts control characters. NLF-sequences are transformed into space, except if one of the NLF2LS/PS/LF options is given. HorizontalTab (HT) and FormFeed (FF) are treated as a NLF-sequence in this case. All other control characters are simply removed.
UTF8PROC_CASEFOLD	Performs unicode case folding, to be able to do a case-insensitive string comparison.
UTF8PROC_CHARBOUND	Inserts 0xFF bytes at the beginning of each sequence which is representing a single grapheme cluster (see UAX#29).
UTF8PROC_LUMP	Lumps certain characters together. E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-". See lump.md for details. If NLF2LF is set, this includes a transformation of paragraph and line separators to ASCII line-feed (LF).
UTF8PROC_STRIPMARK	Strips all character markings. This includes non-spacing, spacing and enclosing (i.e. accents). Note: This option works only with UTF8PROC_COMPOSE or UTF8PROC_DECOMPOSE.
UTF8PROC_STRIPNA	Strip unassigned codepoints.

Function Documentation

◆ utf8proc_category()

utf8proc_category_t utf8proc_category ( utf8proc_int32_t codepoint )

Return the Unicode category for the codepoint (one of the utf8proc_category_t constants.)

◆ utf8proc_category_string()

const char * utf8proc_category_string ( utf8proc_int32_t codepoint )

Return the two-letter (nul-terminated) Unicode category string for the codepoint (e.g. "Lu" or "Co").

◆ utf8proc_charwidth()

int utf8proc_charwidth ( utf8proc_int32_t codepoint )

Given a codepoint, return a character width analogous to wcwidth(codepoint), except that a width of 0 is returned for non-printable codepoints instead of -1 as in wcwidth.

Note: If you want to check for particular types of non-printable characters, (analogous to isprint or iscntrl), use utf8proc_category().

◆ utf8proc_charwidth_ambiguous()

utf8proc_bool utf8proc_charwidth_ambiguous ( utf8proc_int32_t codepoint )

Given a codepoint, return whether it has East Asian width class A (Ambiguous)

Codepoints with this property are considered to have charwidth 1 (if they are printable) but some East Asian fonts render them as double width.

◆ utf8proc_codepoint_valid()

utf8proc_bool utf8proc_codepoint_valid ( utf8proc_int32_t codepoint )

Check if a codepoint is valid (regardless of whether it has been assigned a value by the current Unicode standard).

Returns: 1 if the given codepoint is valid and otherwise return 0.

◆ utf8proc_decompose()

utf8proc_ssize_t utf8proc_decompose	(	const utf8proc_uint8_t *	str,
		utf8proc_ssize_t	strlen,
		utf8proc_int32_t *	buffer,
		utf8proc_ssize_t	bufsize,
		utf8proc_option_t	options )

The same as utf8proc_decompose_char(), but acts on a whole UTF-8 string and orders the decomposed sequences correctly.

If the UTF8PROC_NULLTERM flag in options is set, processing will be stopped, when a NULL byte is encountered, otherwise strlen bytes are processed. The result (in the form of 32-bit unicode codepoints) is written into the buffer being pointed to by buffer (which must contain at least bufsize entries). In case of success, the number of codepoints written is returned; in case of an error, a negative error code is returned (utf8proc_errmsg()). See utf8proc_decompose_custom() to supply additional transformations.

If the number of written codepoints would be bigger than bufsize, the required buffer size is returned, while the buffer will be overwritten with undefined data.

◆ utf8proc_decompose_char()

utf8proc_ssize_t utf8proc_decompose_char	(	utf8proc_int32_t	codepoint,
		utf8proc_int32_t *	dst,
		utf8proc_ssize_t	bufsize,
		utf8proc_option_t	options,
		int *	last_boundclass )

Decompose a codepoint into an array of codepoints.

Parameters

codepoint	the codepoint.
dst	the destination buffer.
bufsize	the size of the destination buffer.
options	one or more of the following flags: UTF8PROC_REJECTNA - return an error if codepoint is unassigned; UTF8PROC_IGNORE - strip "default ignorable" codepoints; UTF8PROC_CASEFOLD - apply Unicode casefolding; UTF8PROC_COMPAT - replace certain codepoints with their compatibility decomposition; UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster; UTF8PROC_LUMP - lump certain different codepoints together; UTF8PROC_STRIPMARK - remove all character marks; UTF8PROC_STRIPNA - remove unassigned codepoints.
last_boundclass	Pointer to an integer variable containing the previous codepoint's (boundclass + indic_conjunct_break << 1) if the UTF8PROC_CHARBOUND option is used. If the string is being processed in order, this can be initialized to 0 for the beginning of the string, and is thereafter updated automatically. Otherwise, this parameter is ignored.

In the current version of utf8proc, the maximum destination buffer with the UTF8PROC_DECOMPOSE option is 4 elements (or double that with UTF8PROC_CHARBOUND), so this is a good default size. However, this may increase in future Unicode versions, so you should always check the return value as described below.

Returns: In case of success, the number of codepoints written is returned; in case of an error, a negative error code is returned (utf8proc_errmsg()).

Note: If the number of written codepoints would be bigger than bufsize, the required buffer size is returned, while the buffer will be overwritten with undefined data.

◆ utf8proc_decompose_custom()

utf8proc_ssize_t utf8proc_decompose_custom	(	const utf8proc_uint8_t *	str,
		utf8proc_ssize_t	strlen,
		utf8proc_int32_t *	buffer,
		utf8proc_ssize_t	bufsize,
		utf8proc_option_t	options,
		utf8proc_custom_func	custom_func,
		void *	custom_data )

The same as utf8proc_decompose(), but also takes a custom_func mapping function that is called on each codepoint in str before any other transformations (along with a custom_data pointer that is passed through to custom_func). The custom_func argument is ignored if it is NULL. See also utf8proc_map_custom().

◆ utf8proc_encode_char()

utf8proc_ssize_t utf8proc_encode_char	(	utf8proc_int32_t	codepoint,
		utf8proc_uint8_t *	dst )

Encodes the codepoint as an UTF-8 string in the byte array pointed to by dst. This array must be at least 4 bytes long.

In case of success the number of bytes written is returned, and otherwise 0 is returned.

This function does not check whether codepoint is valid Unicode.

◆ utf8proc_errmsg()

const char * utf8proc_errmsg ( utf8proc_ssize_t errcode )

Returns an informative error string for the given utf8proc error code (e.g. the error codes returned by utf8proc_map()).

◆ utf8proc_get_property()

const utf8proc_property_t * utf8proc_get_property ( utf8proc_int32_t codepoint )

Look up the properties for a given codepoint.

Parameters

codepoint The Unicode codepoint.

Returns: A pointer to a (constant) struct containing information about the codepoint.

Note: If the codepoint is unassigned or invalid, a pointer to a special struct is returned in which category is 0 (UTF8PROC_CATEGORY_CN).

◆ utf8proc_grapheme_break()

utf8proc_bool utf8proc_grapheme_break	(	utf8proc_int32_t	codepoint1,
		utf8proc_int32_t	codepoint2 )

Same as utf8proc_grapheme_break_stateful(), except without support for the Unicode 9 additions to the algorithm. Supported for legacy reasons.

◆ utf8proc_grapheme_break_stateful()

utf8proc_bool utf8proc_grapheme_break_stateful	(	utf8proc_int32_t	codepoint1,
		utf8proc_int32_t	codepoint2,
		utf8proc_int32_t *	state )

Given a pair of consecutive codepoints, return whether a grapheme break is permitted between them (as defined by the extended grapheme clusters in UAX#29).

Parameters

codepoint1	The first codepoint.
codepoint2	The second codepoint, occurring consecutively after codepoint1.
state	Beginning with Version 29 (Unicode 9.0.0), this algorithm requires state to break graphemes. This state can be passed in as a pointer in the state argument and should initially be set to 0. If the state is not passed in (i.e. a null pointer is passed), UAX#29 rules GB10/12/13 which require this state will not be applied, essentially matching the rules in Unicode 8.0.0.

Warning: If the state parameter is used, utf8proc_grapheme_break_stateful must be called IN ORDER on ALL potential breaks in a string. However, it is safe to reset the state to zero after a grapheme break.

◆ utf8proc_islower()

int utf8proc_islower ( utf8proc_int32_t c )

Given a codepoint c, return 1 if the codepoint corresponds to a lower-case character and 0 otherwise.

◆ utf8proc_isupper()

int utf8proc_isupper ( utf8proc_int32_t c )

Given a codepoint c, return 1 if the codepoint corresponds to an upper-case character and 0 otherwise.

◆ utf8proc_iterate()

utf8proc_ssize_t utf8proc_iterate	(	const utf8proc_uint8_t *	str,
		utf8proc_ssize_t	strlen,
		utf8proc_int32_t *	codepoint_ref )

Reads a single codepoint from the UTF-8 sequence being pointed to by str. The maximum number of bytes read is strlen, unless strlen is negative (in which case up to 4 bytes are read).

If a valid codepoint could be read, it is stored in the variable pointed to by codepoint_ref, otherwise that variable will be set to -1. In case of success, the number of bytes read is returned; otherwise, a negative error code is returned.

◆ utf8proc_map()

utf8proc_ssize_t utf8proc_map	(	const utf8proc_uint8_t *	str,
		utf8proc_ssize_t	strlen,
		utf8proc_uint8_t **	dstptr,
		utf8proc_option_t	options )

Maps the given UTF-8 string pointed to by str to a new UTF-8 string, allocated dynamically by malloc and returned via dstptr.

If the UTF8PROC_NULLTERM flag in the options field is set, the length is determined by a NULL terminator, otherwise the parameter strlen is evaluated to determine the string length, but in any case the result will be NULL terminated (though it might contain NULL characters within the string if str contained NULL characters). Other flags in the options field are passed to the functions defined above, and regarded as described. See also utf8proc_map_custom() to supply a custom codepoint transformation.

In case of success the length of the new string is returned, otherwise a negative error code is returned.

Note: The memory of the new UTF-8 string will have been allocated with malloc, and should therefore be deallocated with free. utf8proc_map simply calls utf8proc_decompose followed by utf8proc_reencode, and applications requiring greater control over memory allocation should instead call those two functions directly.

◆ utf8proc_map_custom()

utf8proc_ssize_t utf8proc_map_custom	(	const utf8proc_uint8_t *	str,
		utf8proc_ssize_t	strlen,
		utf8proc_uint8_t **	dstptr,
		utf8proc_option_t	options,
		utf8proc_custom_func	custom_func,
		void *	custom_data )

Like utf8proc_map(), but also takes a custom_func mapping function that is called on each codepoint in str before any other transformations (along with a custom_data pointer that is passed through to custom_func). The custom_func argument is ignored if it is NULL.

◆ utf8proc_NFC()

utf8proc_uint8_t * utf8proc_NFC ( const utf8proc_uint8_t * str )

NFC normalization (UTF8PROC_COMPOSE).

◆ utf8proc_NFD()

utf8proc_uint8_t * utf8proc_NFD ( const utf8proc_uint8_t * str )

NFD normalization (UTF8PROC_DECOMPOSE).

◆ utf8proc_NFKC()

utf8proc_uint8_t * utf8proc_NFKC ( const utf8proc_uint8_t * str )

NFKC normalization (UTF8PROC_COMPOSE and UTF8PROC_COMPAT).

◆ utf8proc_NFKC_Casefold()

utf8proc_uint8_t * utf8proc_NFKC_Casefold ( const utf8proc_uint8_t * str )

NFKC_Casefold normalization (UTF8PROC_COMPOSE and UTF8PROC_COMPAT and UTF8PROC_CASEFOLD and UTF8PROC_IGNORE).

◆ utf8proc_NFKD()

utf8proc_uint8_t * utf8proc_NFKD ( const utf8proc_uint8_t * str )

NFKD normalization (UTF8PROC_DECOMPOSE and UTF8PROC_COMPAT).

◆ utf8proc_normalize_utf32()

utf8proc_ssize_t utf8proc_normalize_utf32	(	utf8proc_int32_t *	buffer,
		utf8proc_ssize_t	length,
		utf8proc_option_t	options )

Normalizes the sequence of length codepoints pointed to by buffer in-place (i.e., the result is also stored in buffer).

Parameters

buffer	the (native-endian UTF-32) unicode codepoints to re-encode.
length	the length (in codepoints) of the buffer.
options	a bitwise or (|) of one or more of the following flags: UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS; UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS; UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF; UTF8PROC_STRIPCC - strip or convert all non-affected control characters; UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite codepoints; UTF8PROC_STABLE - prohibit combining characters that would violate the unicode versioning stability.

Returns: In case of success, the length (in codepoints) of the normalized UTF-32 string is returned; otherwise, a negative error code is returned (utf8proc_errmsg()).

Warning: The entries of the array pointed to by buffer have to be in the range 0x0000 to 0x10FFFF. Otherwise, the program might crash!

◆ utf8proc_reencode()

utf8proc_ssize_t utf8proc_reencode	(	utf8proc_int32_t *	buffer,
		utf8proc_ssize_t	length,
		utf8proc_option_t	options )

Reencodes the sequence of length codepoints pointed to by buffer as UTF-8 data in-place (i.e., the result is also stored in buffer). Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.

Parameters

buffer	the (native-endian UTF-32) unicode codepoints to re-encode.
length	the length (in codepoints) of the buffer.
options	a bitwise or (|) of one or more of the following flags: UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS; UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS; UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF; UTF8PROC_STRIPCC - strip or convert all non-affected control characters; UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite codepoints; UTF8PROC_STABLE - prohibit combining characters that would violate the unicode versioning stability; UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster.

Returns: In case of success, the length (in bytes) of the resulting nul-terminated UTF-8 string is returned; otherwise, a negative error code is returned (utf8proc_errmsg()).

Warning: The amount of free space pointed to by buffer must exceed the amount of the input data by one byte, and the entries of the array pointed to by buffer have to be in the range 0x0000 to 0x10FFFF. Otherwise, the program might crash!

◆ utf8proc_tolower()

utf8proc_int32_t utf8proc_tolower ( utf8proc_int32_t c )

Given a codepoint c, return the codepoint of the corresponding lower-case character, if any; otherwise (if there is no lower-case variant, or if c is not a valid codepoint) return c.

◆ utf8proc_totitle()

utf8proc_int32_t utf8proc_totitle ( utf8proc_int32_t c )

Given a codepoint c, return the codepoint of the corresponding title-case character, if any; otherwise (if there is no title-case variant, or if c is not a valid codepoint) return c.

◆ utf8proc_toupper()

utf8proc_int32_t utf8proc_toupper ( utf8proc_int32_t c )

Given a codepoint c, return the codepoint of the corresponding upper-case character, if any; otherwise (if there is no upper-case variant, or if c is not a valid codepoint) return c.

◆ utf8proc_unicode_version()

const char * utf8proc_unicode_version ( void )

Returns the utf8proc supported Unicode version as a string MAJOR.MINOR.PATCH.

◆ utf8proc_version()

const char * utf8proc_version ( void )

Returns the utf8proc API version as a string MAJOR.MINOR.PATCH (http://semver.org format), possibly with a "-dev" suffix for development versions.

Variable Documentation

◆ utf8proc_utf8class

const utf8proc_int8_t utf8proc_utf8class[256]

extern

Array containing the byte lengths of a UTF-8 encoded codepoint based on the first byte.

Data Structures

Macros

Typedefs

Enumerations

Functions

Variables

Macro Definition Documentation

◆ UTF8PROC_ERROR_INVALIDOPTS

◆ UTF8PROC_ERROR_INVALIDUTF8

◆ UTF8PROC_ERROR_NOMEM

◆ UTF8PROC_ERROR_NOTASSIGNED

◆ UTF8PROC_ERROR_OVERFLOW

◆ UTF8PROC_VERSION_MAJOR

◆ UTF8PROC_VERSION_MINOR

◆ UTF8PROC_VERSION_PATCH

Typedef Documentation

◆ utf8proc_custom_func

◆ utf8proc_property_t

◆ utf8proc_propval_t

Enumeration Type Documentation

◆ utf8proc_bidi_class_t

◆ utf8proc_boundclass_t

◆ utf8proc_category_t

◆ utf8proc_decomp_type_t

◆ utf8proc_indic_conjunct_break_t

◆ utf8proc_option_t

Function Documentation

◆ utf8proc_category()

◆ utf8proc_category_string()

◆ utf8proc_charwidth()

◆ utf8proc_charwidth_ambiguous()

◆ utf8proc_codepoint_valid()

◆ utf8proc_decompose()

◆ utf8proc_decompose_char()

◆ utf8proc_decompose_custom()

◆ utf8proc_encode_char()

◆ utf8proc_errmsg()

◆ utf8proc_get_property()

◆ utf8proc_grapheme_break()

◆ utf8proc_grapheme_break_stateful()

◆ utf8proc_islower()

◆ utf8proc_isupper()

◆ utf8proc_iterate()

◆ utf8proc_map()

◆ utf8proc_map_custom()

◆ utf8proc_NFC()

◆ utf8proc_NFD()

◆ utf8proc_NFKC()

◆ utf8proc_NFKC_Casefold()

◆ utf8proc_NFKD()

◆ utf8proc_normalize_utf32()

◆ utf8proc_reencode()

◆ utf8proc_tolower()

◆ utf8proc_totitle()

◆ utf8proc_toupper()

◆ utf8proc_unicode_version()

◆ utf8proc_version()

Variable Documentation

◆ utf8proc_utf8class