utf8proc
C library for processing UTF-8 Unicode data
|
#include <stdlib.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
#include <limits.h>
Go to the source code of this file.
Data Structures | |
struct | utf8proc_property_struct |
Macros | |
#define | UTF8PROC_DLLEXPORT |
API version | |
The utf8proc API version MAJOR.MINOR.PATCH, following semantic-versioning rules (http://semver.org) based on API compatibility. This is also returned at runtime by utf8proc_version(); however, the runtime version may append a string like "-dev" to the version number for prerelease versions.
| |
#define | UTF8PROC_VERSION_MAJOR 2 |
#define | UTF8PROC_VERSION_MINOR 10 |
#define | UTF8PROC_VERSION_PATCH 0 |
Error codes | |
Error codes being returned by almost all functions. | |
#define | UTF8PROC_ERROR_NOMEM -1 |
#define | UTF8PROC_ERROR_OVERFLOW -2 |
#define | UTF8PROC_ERROR_INVALIDUTF8 -3 |
#define | UTF8PROC_ERROR_NOTASSIGNED -4 |
#define | UTF8PROC_ERROR_INVALIDOPTS -5 |
Typedefs | |
typedef int8_t | utf8proc_int8_t |
typedef uint8_t | utf8proc_uint8_t |
typedef int16_t | utf8proc_int16_t |
typedef uint16_t | utf8proc_uint16_t |
typedef int32_t | utf8proc_int32_t |
typedef uint32_t | utf8proc_uint32_t |
typedef size_t | utf8proc_size_t |
typedef ptrdiff_t | utf8proc_ssize_t |
typedef bool | utf8proc_bool |
typedef utf8proc_int16_t | utf8proc_propval_t |
typedef struct utf8proc_property_struct | utf8proc_property_t |
typedef utf8proc_int32_t(* | utf8proc_custom_func) (utf8proc_int32_t codepoint, void *data) |
Functions | |
const char * | utf8proc_version (void) |
const char * | utf8proc_unicode_version (void) |
const char * | utf8proc_errmsg (utf8proc_ssize_t errcode) |
utf8proc_ssize_t | utf8proc_iterate (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref) |
utf8proc_bool | utf8proc_codepoint_valid (utf8proc_int32_t codepoint) |
utf8proc_ssize_t | utf8proc_encode_char (utf8proc_int32_t codepoint, utf8proc_uint8_t *dst) |
const utf8proc_property_t * | utf8proc_get_property (utf8proc_int32_t codepoint) |
utf8proc_ssize_t | utf8proc_decompose_char (utf8proc_int32_t codepoint, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) |
utf8proc_ssize_t | utf8proc_decompose (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options) |
utf8proc_ssize_t | utf8proc_decompose_custom (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, utf8proc_custom_func custom_func, void *custom_data) |
utf8proc_ssize_t | utf8proc_normalize_utf32 (utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) |
utf8proc_ssize_t | utf8proc_reencode (utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) |
utf8proc_bool | utf8proc_grapheme_break_stateful (utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state) |
utf8proc_bool | utf8proc_grapheme_break (utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2) |
utf8proc_int32_t | utf8proc_tolower (utf8proc_int32_t c) |
utf8proc_int32_t | utf8proc_toupper (utf8proc_int32_t c) |
utf8proc_int32_t | utf8proc_totitle (utf8proc_int32_t c) |
int | utf8proc_islower (utf8proc_int32_t c) |
int | utf8proc_isupper (utf8proc_int32_t c) |
int | utf8proc_charwidth (utf8proc_int32_t codepoint) |
utf8proc_bool | utf8proc_charwidth_ambiguous (utf8proc_int32_t codepoint) |
utf8proc_category_t | utf8proc_category (utf8proc_int32_t codepoint) |
const char * | utf8proc_category_string (utf8proc_int32_t codepoint) |
utf8proc_ssize_t | utf8proc_map (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options) |
utf8proc_ssize_t | utf8proc_map_custom (const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options, utf8proc_custom_func custom_func, void *custom_data) |
Unicode normalization | |
Returns a pointer to newly allocated memory of a NFD, NFC, NFKD, NFKC or NFKC_Casefold normalized version of the null-terminated string | |
utf8proc_uint8_t * | utf8proc_NFD (const utf8proc_uint8_t *str) |
utf8proc_uint8_t * | utf8proc_NFC (const utf8proc_uint8_t *str) |
utf8proc_uint8_t * | utf8proc_NFKD (const utf8proc_uint8_t *str) |
utf8proc_uint8_t * | utf8proc_NFKC (const utf8proc_uint8_t *str) |
utf8proc_uint8_t * | utf8proc_NFKC_Casefold (const utf8proc_uint8_t *str) |
Variables | |
const utf8proc_int8_t | utf8proc_utf8class [256] |
#define UTF8PROC_ERROR_INVALIDOPTS -5 |
Invalid options have been used.
#define UTF8PROC_ERROR_INVALIDUTF8 -3 |
The given string is not a legal UTF-8 string.
#define UTF8PROC_ERROR_NOMEM -1 |
Memory could not be allocated.
#define UTF8PROC_ERROR_NOTASSIGNED -4 |
The UTF8PROC_REJECTNA flag was set and an unassigned codepoint was found.
#define UTF8PROC_ERROR_OVERFLOW -2 |
The given string is too long to be processed.
#define UTF8PROC_VERSION_MAJOR 2 |
The MAJOR version number (increased when backwards API compatibility is broken).
#define UTF8PROC_VERSION_MINOR 10 |
The MINOR version number (increased when new functionality is added in a backwards-compatible manner).
#define UTF8PROC_VERSION_PATCH 0 |
The PATCH version (increased for fixes that do not change the API).
typedef utf8proc_int32_t(* utf8proc_custom_func) (utf8proc_int32_t codepoint, void *data) |
Function pointer type passed to utf8proc_map_custom() and utf8proc_decompose_custom(), which is used to specify a user-defined mapping of codepoints to be applied in conjunction with other mappings.
typedef struct utf8proc_property_struct utf8proc_property_t |
Struct containing information about a codepoint.
typedef utf8proc_int16_t utf8proc_propval_t |
Holds the value of a property.
Bidirectional character classes.
Boundclass property. (TR29)
enum utf8proc_category_t |
Unicode categories.
Decomposition type.
Indic_Conjunct_Break property. (TR44)
enum utf8proc_option_t |
Option flags used by several functions in the library.
Enumerator | |
---|---|
UTF8PROC_NULLTERM | The given UTF-8 input is NULL terminated. |
UTF8PROC_STABLE | Unicode Versioning Stability has to be respected. |
UTF8PROC_COMPAT | Compatibility decomposition (i.e. formatting information is lost). |
UTF8PROC_COMPOSE | Return a result with composed characters. |
UTF8PROC_DECOMPOSE | Return a result with decomposed characters. |
UTF8PROC_IGNORE | Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE. |
UTF8PROC_REJECTNA | Return an error, if the input contains unassigned codepoints. |
UTF8PROC_NLF2LS | Indicating that NLF-sequences (LF, CRLF, CR, NEL) are representing a line break, and should be converted to the codepoint for line separation (LS). |
UTF8PROC_NLF2PS | Indicating that NLF-sequences are representing a paragraph break, and should be converted to the codepoint for paragraph separation (PS). |
UTF8PROC_NLF2LF | Indicating that the meaning of NLF-sequences is unknown. |
UTF8PROC_STRIPCC | Strips and/or converts control characters. NLF-sequences are transformed into space, except if one of the NLF2LS/PS/LF options is given. HorizontalTab (HT) and FormFeed (FF) are treated as a NLF-sequence in this case. All other control characters are simply removed. |
UTF8PROC_CASEFOLD | Performs unicode case folding, to be able to do a case-insensitive string comparison. |
UTF8PROC_CHARBOUND | Inserts 0xFF bytes at the beginning of each sequence which is representing a single grapheme cluster (see UAX#29). |
UTF8PROC_LUMP | Lumps certain characters together. E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-". See lump.md for details. If NLF2LF is set, this includes a transformation of paragraph and line separators to ASCII line-feed (LF). |
UTF8PROC_STRIPMARK | Strips all character markings. This includes non-spacing, spacing and enclosing (i.e. accents).
|
UTF8PROC_STRIPNA | Strip unassigned codepoints. |
utf8proc_category_t utf8proc_category | ( | utf8proc_int32_t | codepoint | ) |
Return the Unicode category for the codepoint (one of the utf8proc_category_t constants.)
const char * utf8proc_category_string | ( | utf8proc_int32_t | codepoint | ) |
Return the two-letter (nul-terminated) Unicode category string for the codepoint (e.g. "Lu"
or "Co"
).
int utf8proc_charwidth | ( | utf8proc_int32_t | codepoint | ) |
Given a codepoint, return a character width analogous to wcwidth(codepoint)
, except that a width of 0 is returned for non-printable codepoints instead of -1 as in wcwidth
.
Note: If you want to check for particular types of non-printable characters (analogous to isprint
or iscntrl
), use utf8proc_category().
utf8proc_bool utf8proc_charwidth_ambiguous | ( | utf8proc_int32_t | codepoint | ) |
Given a codepoint, return whether it has East Asian width class A (Ambiguous)
Codepoints with this property are considered to have charwidth 1 (if they are printable) but some East Asian fonts render them as double width.
utf8proc_bool utf8proc_codepoint_valid | ( | utf8proc_int32_t | codepoint | ) |
Check if a codepoint is valid (regardless of whether it has been assigned a value by the current Unicode standard).
Returns 1 if the given codepoint
is valid and otherwise return 0.
utf8proc_ssize_t utf8proc_decompose | ( | const utf8proc_uint8_t * | str, |
utf8proc_ssize_t | strlen, | ||
utf8proc_int32_t * | buffer, | ||
utf8proc_ssize_t | bufsize, | ||
utf8proc_option_t | options ) |
The same as utf8proc_decompose_char(), but acts on a whole UTF-8 string and orders the decomposed sequences correctly.
If the UTF8PROC_NULLTERM flag in options
is set, processing will be stopped, when a NULL byte is encountered, otherwise strlen
bytes are processed. The result (in the form of 32-bit unicode codepoints) is written into the buffer being pointed to by buffer
(which must contain at least bufsize
entries). In case of success, the number of codepoints written is returned; in case of an error, a negative error code is returned (utf8proc_errmsg()). See utf8proc_decompose_custom() to supply additional transformations.
If the number of written codepoints would be bigger than bufsize
, the required buffer size is returned, while the buffer will be overwritten with undefined data.
utf8proc_ssize_t utf8proc_decompose_char | ( | utf8proc_int32_t | codepoint, |
utf8proc_int32_t * | dst, | ||
utf8proc_ssize_t | bufsize, | ||
utf8proc_option_t | options, | ||
int * | last_boundclass ) |
Decompose a codepoint into an array of codepoints.
codepoint | the codepoint. |
dst | the destination buffer. |
bufsize | the size of the destination buffer. |
options | one or more of the following flags:
|
last_boundclass | Pointer to an integer variable containing the previous codepoint's (boundclass + indic_conjunct_break << 1) if the UTF8PROC_CHARBOUND option is used. If the string is being processed in order, this can be initialized to 0 for the beginning of the string, and is thereafter updated automatically. Otherwise, this parameter is ignored. |
If the number of written codepoints would be bigger than bufsize
, the required buffer size is returned, while the buffer will be overwritten with undefined data.
utf8proc_ssize_t utf8proc_decompose_custom | ( | const utf8proc_uint8_t * | str, |
utf8proc_ssize_t | strlen, | ||
utf8proc_int32_t * | buffer, | ||
utf8proc_ssize_t | bufsize, | ||
utf8proc_option_t | options, | ||
utf8proc_custom_func | custom_func, | ||
void * | custom_data ) |
The same as utf8proc_decompose(), but also takes a custom_func
mapping function that is called on each codepoint in str
before any other transformations (along with a custom_data
pointer that is passed through to custom_func
). The custom_func
argument is ignored if it is NULL
. See also utf8proc_map_custom().
utf8proc_ssize_t utf8proc_encode_char | ( | utf8proc_int32_t | codepoint, |
utf8proc_uint8_t * | dst ) |
Encodes the codepoint as a UTF-8 string in the byte array pointed to by dst
. This array must be at least 4 bytes long.
In case of success the number of bytes written is returned, and otherwise 0 is returned.
This function does not check whether codepoint
is valid Unicode.
const char * utf8proc_errmsg | ( | utf8proc_ssize_t | errcode | ) |
Returns an informative error string for the given utf8proc error code (e.g. the error codes returned by utf8proc_map()).
const utf8proc_property_t * utf8proc_get_property | ( | utf8proc_int32_t | codepoint | ) |
Look up the properties for a given codepoint.
codepoint | The Unicode codepoint. |
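Returns a pointer to a (constant) struct containing information about the codepoint. If the codepoint is unassigned or invalid, a pointer to a special struct is returned in which category
is 0 (UTF8PROC_CATEGORY_CN).
utf8proc_bool utf8proc_grapheme_break | ( | utf8proc_int32_t | codepoint1, |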
utf8proc_int32_t | codepoint2 ) |
Same as utf8proc_grapheme_break_stateful(), except without support for the Unicode 9 additions to the algorithm. Supported for legacy reasons.
utf8proc_bool utf8proc_grapheme_break_stateful | ( | utf8proc_int32_t | codepoint1, |
utf8proc_int32_t | codepoint2, | ||
utf8proc_int32_t * | state ) |
Given a pair of consecutive codepoints, return whether a grapheme break is permitted between them (as defined by the extended grapheme clusters in UAX#29).
codepoint1 | The first codepoint. |
codepoint2 | The second codepoint, occurring consecutively after codepoint1 . |
state | Beginning with Version 29 (Unicode 9.0.0), this algorithm requires state to break graphemes. This state can be passed in as a pointer in the state argument and should initially be set to 0. If the state is not passed in (i.e. a null pointer is passed), UAX#29 rules GB10/12/13 which require this state will not be applied, essentially matching the rules in Unicode 8.0.0. |
Warning: utf8proc_grapheme_break_stateful
must be called IN ORDER on ALL potential breaks in a string. However, it is safe to reset the state to zero after a grapheme break.
int utf8proc_islower | ( | utf8proc_int32_t | c | ) |
Given a codepoint c
, return 1
if the codepoint corresponds to a lower-case character and 0
otherwise.
int utf8proc_isupper | ( | utf8proc_int32_t | c | ) |
Given a codepoint c
, return 1
if the codepoint corresponds to an upper-case character and 0
otherwise.
utf8proc_ssize_t utf8proc_iterate | ( | const utf8proc_uint8_t * | str, |
utf8proc_ssize_t | strlen, | ||
utf8proc_int32_t * | codepoint_ref ) |
Reads a single codepoint from the UTF-8 sequence being pointed to by str
. The maximum number of bytes read is strlen
, unless strlen
is negative (in which case up to 4 bytes are read).
If a valid codepoint could be read, it is stored in the variable pointed to by codepoint_ref
, otherwise that variable will be set to -1. In case of success, the number of bytes read is returned; otherwise, a negative error code is returned.
utf8proc_ssize_t utf8proc_map | ( | const utf8proc_uint8_t * | str, |
utf8proc_ssize_t | strlen, | ||
utf8proc_uint8_t ** | dstptr, | ||
utf8proc_option_t | options ) |
Maps the given UTF-8 string pointed to by str
to a new UTF-8 string, allocated dynamically by malloc
and returned via dstptr
.
If the UTF8PROC_NULLTERM flag in the options
field is set, the length is determined by a NULL terminator, otherwise the parameter strlen
is evaluated to determine the string length, but in any case the result will be NULL terminated (though it might contain NUL characters within the string if str
contained NULL characters). Other flags in the options
field are passed to the functions defined above, and regarded as described. See also utf8proc_map_custom() to supply a custom codepoint transformation.
In case of success the length of the new string is returned, otherwise a negative error code is returned.
Note: The memory of the new UTF-8 string will have been allocated with malloc
, and should therefore be deallocated with free
.
utf8proc_ssize_t utf8proc_map_custom | ( | const utf8proc_uint8_t * | str, |
utf8proc_ssize_t | strlen, | ||
utf8proc_uint8_t ** | dstptr, | ||
utf8proc_option_t | options, | ||
utf8proc_custom_func | custom_func, | ||
void * | custom_data ) |
Like utf8proc_map(), but also takes a custom_func
mapping function that is called on each codepoint in str
before any other transformations (along with a custom_data
pointer that is passed through to custom_func
). The custom_func
argument is ignored if it is NULL
.
utf8proc_uint8_t * utf8proc_NFC | ( | const utf8proc_uint8_t * | str | ) |
NFC normalization (UTF8PROC_COMPOSE).
utf8proc_uint8_t * utf8proc_NFD | ( | const utf8proc_uint8_t * | str | ) |
NFD normalization (UTF8PROC_DECOMPOSE).
utf8proc_uint8_t * utf8proc_NFKC | ( | const utf8proc_uint8_t * | str | ) |
NFKC normalization (UTF8PROC_COMPOSE and UTF8PROC_COMPAT).
utf8proc_uint8_t * utf8proc_NFKC_Casefold | ( | const utf8proc_uint8_t * | str | ) |
NFKC_Casefold normalization (UTF8PROC_COMPOSE and UTF8PROC_COMPAT and UTF8PROC_CASEFOLD and UTF8PROC_IGNORE).
utf8proc_uint8_t * utf8proc_NFKD | ( | const utf8proc_uint8_t * | str | ) |
NFKD normalization (UTF8PROC_DECOMPOSE and UTF8PROC_COMPAT).
utf8proc_ssize_t utf8proc_normalize_utf32 | ( | utf8proc_int32_t * | buffer, |
utf8proc_ssize_t | length, | ||
utf8proc_option_t | options ) |
Normalizes the sequence of length
codepoints pointed to by buffer
in-place (i.e., the result is also stored in buffer
).
buffer | the (native-endian UTF-32) unicode codepoints to normalize. |
length | the length (in codepoints) of the buffer. |
options | a bitwise or (| ) of one or more of the following flags:
|
Warning: The entries of the array pointed to by buffer
have to be in the range 0x0000
to 0x10FFFF
. Otherwise, the program might crash!
utf8proc_ssize_t utf8proc_reencode | ( | utf8proc_int32_t * | buffer, |
utf8proc_ssize_t | length, | ||
utf8proc_option_t | options ) |
Reencodes the sequence of length
codepoints pointed to by buffer
as UTF-8 data in-place (i.e., the result is also stored in buffer
). Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
buffer | the (native-endian UTF-32) unicode codepoints to re-encode. |
length | the length (in codepoints) of the buffer. |
options | a bitwise or (| ) of one or more of the following flags:
|
Warning: The amount of free space pointed to by buffer
must exceed the amount of the input data by one byte, and the entries of the array pointed to by buffer
have to be in the range 0x0000
to 0x10FFFF
. Otherwise, the program might crash!
utf8proc_int32_t utf8proc_tolower | ( | utf8proc_int32_t | c | ) |
Given a codepoint c
, return the codepoint of the corresponding lower-case character, if any; otherwise (if there is no lower-case variant, or if c
is not a valid codepoint) return c
.
utf8proc_int32_t utf8proc_totitle | ( | utf8proc_int32_t | c | ) |
Given a codepoint c
, return the codepoint of the corresponding title-case character, if any; otherwise (if there is no title-case variant, or if c
is not a valid codepoint) return c
.
utf8proc_int32_t utf8proc_toupper | ( | utf8proc_int32_t | c | ) |
Given a codepoint c
, return the codepoint of the corresponding upper-case character, if any; otherwise (if there is no upper-case variant, or if c
is not a valid codepoint) return c
.
const char * utf8proc_unicode_version | ( | void | ) |
Returns the utf8proc supported Unicode version as a string MAJOR.MINOR.PATCH.
const char * utf8proc_version | ( | void | ) |
Returns the utf8proc API version as a string MAJOR.MINOR.PATCH (http://semver.org format), possibly with a "-dev" suffix for development versions.
|
extern |
Array containing the byte lengths of a UTF-8 encoded codepoint based on the first byte.