mirror of
https://github.com/asterisk/asterisk.git
synced 2025-09-05 12:16:00 +00:00
utf8.c: Add UTF-8 validation and utility functions
There are various places in Asterisk - specifically in regards to database integration - where having some kind of UTF-8 validation would be beneficial. This patch adds: * Functions to validate that a given string contains only valid UTF-8 sequences. * A function to copy a string (similar to ast_copy_string) stopping when an invalid UTF-8 sequence is encountered. * A UTF-8 validator that allows for progressive validation. All of this is based on the excellent UTF-8 decoder by Björn Höhrmann. More information is available here: https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ The API was written in such a way that should allow us to replace the implementation later should we determine that we need something more comprehensive. Change-Id: I3555d787a79e7c780a7800cd26e0b5056368abf9
This commit is contained in:
committed by
Kevin Harwell
parent
2e32b56bdb
commit
d9ae902f52
188
include/asterisk/utf8.h
Normal file
188
include/asterisk/utf8.h
Normal file
@@ -0,0 +1,188 @@
|
|||||||
|
/*
|
||||||
|
* Asterisk -- An open source telephony toolkit.
|
||||||
|
*
|
||||||
|
* Copyright (C) 2020, Sean Bright
|
||||||
|
*
|
||||||
|
* Sean Bright <sean.bright@gmail.com>
|
||||||
|
*
|
||||||
|
* See http://www.asterisk.org for more information about
|
||||||
|
* the Asterisk project. Please do not directly contact
|
||||||
|
* any of the maintainers of this project for assistance;
|
||||||
|
* the project provides a web site, mailing lists and IRC
|
||||||
|
* channels for your use.
|
||||||
|
*
|
||||||
|
* This program is free software, distributed under the terms of
|
||||||
|
* the GNU General Public License Version 2. See the LICENSE file
|
||||||
|
* at the top of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*! \file
|
||||||
|
*
|
||||||
|
* \brief UTF-8 information and validation functions
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef ASTERISK_UTF8_H
|
||||||
|
#define ASTERISK_UTF8_H
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Check if a zero-terminated string is valid UTF-8
|
||||||
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
||||||
|
*
|
||||||
|
* \param str The zero-terminated string to check
|
||||||
|
*
|
||||||
|
* \retval 0 if the string is not valid UTF-8
|
||||||
|
* \retval Non-zero if the string is valid UTF-8
|
||||||
|
*/
|
||||||
|
int ast_utf8_is_valid(const char *str);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Check if the first \a size bytes of a string are valid UTF-8
|
||||||
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
||||||
|
*
|
||||||
|
* Similar to \a ast_utf8_is_valid() but checks the first \a size bytes or until
|
||||||
|
* a zero byte is reached, whichever comes first.
|
||||||
|
*
|
||||||
|
* \param str The string to check
|
||||||
|
* \param size The number of bytes to evaluate
|
||||||
|
*
|
||||||
|
* \retval 0 if the string is not valid UTF-8
|
||||||
|
* \retval Non-zero if the string is valid UTF-8
|
||||||
|
*/
|
||||||
|
int ast_utf8_is_validn(const char *str, size_t size);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Copy a string safely ensuring valid UTF-8
|
||||||
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
||||||
|
*
|
||||||
|
* This is similar to \a ast_copy_string, but it will only copy valid UTF-8
|
||||||
|
* sequences from the source string into the destination buffer. If an invalid
|
||||||
|
* UTF-8 sequence is encountered, or the available space in the destination
|
||||||
|
* buffer is exhausted in the middle of an otherwise valid UTF-8 sequence, the
|
||||||
|
* destination buffer will be truncated to ensure that it only contains valid
|
||||||
|
* UTF-8.
|
||||||
|
*
|
||||||
|
* \param dst The destination buffer.
|
||||||
|
* \param src The source string
|
||||||
|
* \param size The size of the destination buffer
|
||||||
|
* \return Nothing.
|
||||||
|
*/
|
||||||
|
void ast_utf8_copy_string(char *dst, const char *src, size_t size);
|
||||||
|
|
||||||
|
/*!
 * \brief Result reported by the UTF-8 validator for the bytes consumed so far
 */
enum ast_utf8_validation_result {
	/*! \brief Everything consumed so far is valid UTF-8
	 *
	 * All bytes given to the validator up to this point form a valid UTF-8
	 * byte sequence. Supplying more bytes can move the validator to either
	 * \a AST_UTF8_INVALID or \a AST_UTF8_UNKNOWN.
	 */
	AST_UTF8_VALID,

	/*! \brief An invalid UTF-8 sequence has been consumed
	 *
	 * The bytes given to the validator up to this point do not form valid
	 * UTF-8. This result is final: supplying more bytes does not change the
	 * validator's state.
	 */
	AST_UTF8_INVALID,

	/*! \brief A multibyte sequence is still in progress
	 *
	 * The validator is part-way through a multibyte UTF-8 sequence and needs
	 * more data before it can decide. Supplying more bytes can move it to
	 * either \a AST_UTF8_VALID or \a AST_UTF8_INVALID. If there is no more
	 * data to supply, the sequence must be treated as invalid.
	 */
	AST_UTF8_UNKNOWN,
};
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Opaque type for UTF-8 validator state.
|
||||||
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
||||||
|
*/
|
||||||
|
struct ast_utf8_validator;
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Create a new UTF-8 validator
|
||||||
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
||||||
|
*
|
||||||
|
* \param[out] validator The validator instance
|
||||||
|
*
|
||||||
|
* \retval 0 on success
|
||||||
|
* \retval -1 on failure
|
||||||
|
*/
|
||||||
|
int ast_utf8_validator_new(struct ast_utf8_validator **validator);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Feed a zero-terminated string into the UTF-8 validator
|
||||||
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
||||||
|
*
|
||||||
|
* \param validator The validator instance
|
||||||
|
* \param data The zero-terminated string to feed into the validator
|
||||||
|
*
|
||||||
|
* \return The \ref ast_utf8_validation_result indicating the current state of
|
||||||
|
* the validator.
|
||||||
|
*/
|
||||||
|
enum ast_utf8_validation_result ast_utf8_validator_feed(
|
||||||
|
struct ast_utf8_validator *validator, const char *data);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Feed a string into the UTF-8 validator
|
||||||
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
||||||
|
*
|
||||||
|
* Similar to \a ast_utf8_validator_feed but will stop feeding in data if a zero
|
||||||
|
* byte is encountered or \a size bytes have been read.
|
||||||
|
*
|
||||||
|
* \param validator The validator instance
|
||||||
|
* \param data The string to feed into the validator
|
||||||
|
* \param size The number of bytes to feed into the validator
|
||||||
|
*
|
||||||
|
* \return The \ref ast_utf8_validation_result indicating the current state of
|
||||||
|
* the validator.
|
||||||
|
*/
|
||||||
|
enum ast_utf8_validation_result ast_utf8_validator_feedn(
|
||||||
|
struct ast_utf8_validator *validator, const char *data, size_t size);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Get the current UTF-8 validator state
|
||||||
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
||||||
|
*
|
||||||
|
* \param validator The validator instance
|
||||||
|
*
|
||||||
|
* \return The \ref ast_utf8_validation_result indicating the current state of
|
||||||
|
* the validator.
|
||||||
|
*/
|
||||||
|
enum ast_utf8_validation_result ast_utf8_validator_state(
|
||||||
|
struct ast_utf8_validator *validator);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Reset the state of a UTF-8 validator
|
||||||
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
||||||
|
*
|
||||||
|
* Resets the provided UTF-8 validator to its initial state so that it can be
|
||||||
|
* reused.
|
||||||
|
*
|
||||||
|
* \param validator The validator instance to reset
|
||||||
|
*/
|
||||||
|
void ast_utf8_validator_reset(
|
||||||
|
struct ast_utf8_validator *validator);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Destroy a UTF-8 validator
|
||||||
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
||||||
|
*
|
||||||
|
* \param validator The validator instance to destroy
|
||||||
|
*/
|
||||||
|
void ast_utf8_validator_destroy(struct ast_utf8_validator *validator);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* \brief Register UTF-8 tests
|
||||||
|
* \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
|
||||||
|
*
|
||||||
|
* Does nothing unless TEST_FRAMEWORK is defined.
|
||||||
|
*
|
||||||
|
* \return Always returns 0
|
||||||
|
*/
|
||||||
|
int ast_utf8_init(void);
|
||||||
|
|
||||||
|
#endif /* ASTERISK_UTF8_H */
|
@@ -242,6 +242,7 @@ int daemon(int, int); /* defined in libresolv of all places */
|
|||||||
#include "asterisk/media_cache.h"
|
#include "asterisk/media_cache.h"
|
||||||
#include "asterisk/astdb.h"
|
#include "asterisk/astdb.h"
|
||||||
#include "asterisk/options.h"
|
#include "asterisk/options.h"
|
||||||
|
#include "asterisk/utf8.h"
|
||||||
|
|
||||||
#include "../defaults.h"
|
#include "../defaults.h"
|
||||||
|
|
||||||
@@ -4068,6 +4069,7 @@ static void asterisk_daemon(int isroot, const char *runuser, const char *rungrou
|
|||||||
check_init(ast_json_init(), "libjansson");
|
check_init(ast_json_init(), "libjansson");
|
||||||
ast_ulaw_init();
|
ast_ulaw_init();
|
||||||
ast_alaw_init();
|
ast_alaw_init();
|
||||||
|
ast_utf8_init();
|
||||||
tdd_init();
|
tdd_init();
|
||||||
callerid_init();
|
callerid_init();
|
||||||
ast_builtins_init();
|
ast_builtins_init();
|
||||||
|
380
main/utf8.c
Normal file
380
main/utf8.c
Normal file
@@ -0,0 +1,380 @@
|
|||||||
|
/*
|
||||||
|
* Asterisk -- An open source telephony toolkit.
|
||||||
|
*
|
||||||
|
* Copyright (C) 2020, Sean Bright
|
||||||
|
*
|
||||||
|
* Sean Bright <sean.bright@gmail.com>
|
||||||
|
*
|
||||||
|
* See http://www.asterisk.org for more information about
|
||||||
|
* the Asterisk project. Please do not directly contact
|
||||||
|
* any of the maintainers of this project for assistance;
|
||||||
|
* the project provides a web site, mailing lists and IRC
|
||||||
|
* channels for your use.
|
||||||
|
*
|
||||||
|
* This program is free software, distributed under the terms of
|
||||||
|
* the GNU General Public License Version 2. See the LICENSE file
|
||||||
|
* at the top of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*! \file
|
||||||
|
*
|
||||||
|
* \brief UTF-8 information and validation functions
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*** MODULEINFO
|
||||||
|
<support_level>core</support_level>
|
||||||
|
***/
|
||||||
|
|
||||||
|
#include "asterisk.h"
|
||||||
|
|
||||||
|
#include "asterisk/utils.h"
|
||||||
|
#include "asterisk/utf8.h"
|
||||||
|
#include "asterisk/test.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* BEGIN THIRD PARTY CODE
|
||||||
|
*
|
||||||
|
* Copyright (c) 2008-2010 Björn Höhrmann <bjoern@hoehrmann.de>
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
* of this software and associated documentation files (the "Software"), to deal
|
||||||
|
* in the Software without restriction, including without limitation the rights
|
||||||
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the Software is
|
||||||
|
* furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in all
|
||||||
|
* copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*
|
||||||
|
* See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define UTF8_ACCEPT 0
|
||||||
|
#define UTF8_REJECT 12
|
||||||
|
|
||||||
|
static const uint8_t utf8d[] = {
|
||||||
|
/* The first part of the table maps bytes to character classes
|
||||||
|
* to reduce the size of the transition table and create bitmasks. */
|
||||||
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
|
||||||
|
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
|
||||||
|
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||||
|
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
|
||||||
|
|
||||||
|
/* The second part is a transition table that maps a combination
|
||||||
|
* of a state of the automaton and a character class to a state. */
|
||||||
|
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
|
||||||
|
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
|
||||||
|
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
|
||||||
|
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
|
||||||
|
12,36,12,12,12,12,12,12,12,12,12,12,
|
||||||
|
};
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
/* We can bring this back if we need the codepoint? */
|
||||||
|
static uint32_t inline decode(uint32_t *state, uint32_t *codep, uint32_t byte) {
|
||||||
|
uint32_t type = utf8d[byte];
|
||||||
|
|
||||||
|
*codep = (*state != UTF8_ACCEPT) ?
|
||||||
|
(byte & 0x3fu) | (*codep << 6) :
|
||||||
|
(0xff >> type) & (byte);
|
||||||
|
|
||||||
|
*state = utf8d[256 + *state + type];
|
||||||
|
return *state;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static uint32_t inline decode(uint32_t *state, uint32_t byte) {
|
||||||
|
uint32_t type = utf8d[byte];
|
||||||
|
*state = utf8d[256 + *state + type];
|
||||||
|
return *state;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* END THIRD PARTY CODE
|
||||||
|
*
|
||||||
|
* See copyright notice above.
|
||||||
|
*/
|
||||||
|
|
||||||
|
int ast_utf8_is_valid(const char *src)
|
||||||
|
{
|
||||||
|
uint32_t state = UTF8_ACCEPT;
|
||||||
|
|
||||||
|
while (*src) {
|
||||||
|
decode(&state, (uint8_t) *src++);
|
||||||
|
}
|
||||||
|
|
||||||
|
return state == UTF8_ACCEPT;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ast_utf8_is_validn(const char *src, size_t size)
|
||||||
|
{
|
||||||
|
uint32_t state = UTF8_ACCEPT;
|
||||||
|
|
||||||
|
while (size && *src) {
|
||||||
|
decode(&state, (uint8_t) *src++);
|
||||||
|
size--;
|
||||||
|
}
|
||||||
|
|
||||||
|
return state == UTF8_ACCEPT;
|
||||||
|
}
|
||||||
|
|
||||||
|
void ast_utf8_copy_string(char *dst, const char *src, size_t size)
|
||||||
|
{
|
||||||
|
uint32_t state = UTF8_ACCEPT;
|
||||||
|
char *last_good = dst;
|
||||||
|
|
||||||
|
ast_assert(size > 0);
|
||||||
|
|
||||||
|
while (size && *src) {
|
||||||
|
if (decode(&state, (uint8_t) *src) == UTF8_REJECT) {
|
||||||
|
/* We _could_ replace with U+FFFD and try to recover, but for now
|
||||||
|
* we treat this the same as if we had run out of space */
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
*dst++ = *src++;
|
||||||
|
size--;
|
||||||
|
|
||||||
|
if (size && state == UTF8_ACCEPT) {
|
||||||
|
/* last_good is where we will ultimately write the 0 byte */
|
||||||
|
last_good = dst;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
*last_good = '\0';
|
||||||
|
}
|
||||||
|
|
||||||
|
/*! \brief Definition of the validator type that the public header only forward-declares */
struct ast_utf8_validator {
	/*! Automaton state carried from one feed call to the next */
	uint32_t state;
};
|
||||||
|
|
||||||
|
int ast_utf8_validator_new(struct ast_utf8_validator **validator)
|
||||||
|
{
|
||||||
|
struct ast_utf8_validator *tmp = ast_malloc(sizeof(*tmp));
|
||||||
|
|
||||||
|
if (!tmp) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
tmp->state = UTF8_ACCEPT;
|
||||||
|
*validator = tmp;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
enum ast_utf8_validation_result ast_utf8_validator_state(
|
||||||
|
struct ast_utf8_validator *validator)
|
||||||
|
{
|
||||||
|
switch (validator->state) {
|
||||||
|
case UTF8_ACCEPT:
|
||||||
|
return AST_UTF8_VALID;
|
||||||
|
case UTF8_REJECT:
|
||||||
|
return AST_UTF8_INVALID;
|
||||||
|
default:
|
||||||
|
return AST_UTF8_UNKNOWN;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
enum ast_utf8_validation_result ast_utf8_validator_feed(
|
||||||
|
struct ast_utf8_validator *validator, const char *data)
|
||||||
|
{
|
||||||
|
while (*data) {
|
||||||
|
decode(&validator->state, (uint8_t) *data++);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ast_utf8_validator_state(validator);
|
||||||
|
}
|
||||||
|
|
||||||
|
enum ast_utf8_validation_result ast_utf8_validator_feedn(
|
||||||
|
struct ast_utf8_validator *validator, const char *data, size_t size)
|
||||||
|
{
|
||||||
|
while (size && *data) {
|
||||||
|
decode(&validator->state, (uint8_t) *data++);
|
||||||
|
size--;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ast_utf8_validator_state(validator);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ast_utf8_validator_reset(struct ast_utf8_validator *validator)
|
||||||
|
{
|
||||||
|
validator->state = UTF8_ACCEPT;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Release a validator obtained from ast_utf8_validator_new(). The instance
 * must not be used after this call.
 */
void ast_utf8_validator_destroy(struct ast_utf8_validator *validator)
{
	ast_free(validator);
}
|
||||||
|
|
||||||
|
#ifdef TEST_FRAMEWORK
|
||||||
|
|
||||||
|
AST_TEST_DEFINE(test_utf8_is_valid)
|
||||||
|
{
|
||||||
|
switch (cmd) {
|
||||||
|
case TEST_INIT:
|
||||||
|
info->name = "is_valid";
|
||||||
|
info->category = "/main/utf8/";
|
||||||
|
info->summary = "Test ast_utf8_is_valid and ast_utf8_is_validn";
|
||||||
|
info->description =
|
||||||
|
"Tests UTF-8 string validation code.";
|
||||||
|
return AST_TEST_NOT_RUN;
|
||||||
|
case TEST_EXECUTE:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Valid UTF-8 */
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("Asterisk"));
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("\xce\xbb"));
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b"));
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e"));
|
||||||
|
|
||||||
|
/* Valid with leading */
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk"));
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb"));
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b"));
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e"));
|
||||||
|
|
||||||
|
/* Valid with trailing */
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("Asterisk aaa"));
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("\xce\xbb aaa"));
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b aaa"));
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e aaa"));
|
||||||
|
|
||||||
|
/* Valid with leading and trailing */
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk aaa"));
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb aaa"));
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b aaa"));
|
||||||
|
ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e aaa"));
|
||||||
|
|
||||||
|
/* Valid if limited by number of bytes */
|
||||||
|
ast_test_validate(test, ast_utf8_is_validn("Asterisk" "\xff", strlen("Asterisk")));
|
||||||
|
ast_test_validate(test, ast_utf8_is_validn("\xce\xbb" "\xff", strlen("\xce\xbb")));
|
||||||
|
ast_test_validate(test, ast_utf8_is_validn("\xe2\x8a\x9b" "\xff", strlen("\xe2\x8a\x9b")));
|
||||||
|
ast_test_validate(test, ast_utf8_is_validn("\xf0\x9f\x93\x9e" "\xff", strlen("\xf0\x9f\x93\x9e")));
|
||||||
|
|
||||||
|
/* Invalid */
|
||||||
|
ast_test_validate(test, !ast_utf8_is_valid("\xc0\x8a")); /* Overlong */
|
||||||
|
ast_test_validate(test, !ast_utf8_is_valid("98.6\xa7")); /* 'High ASCII' */
|
||||||
|
ast_test_validate(test, !ast_utf8_is_valid("\xc3\x28"));
|
||||||
|
ast_test_validate(test, !ast_utf8_is_valid("\xa0\xa1"));
|
||||||
|
ast_test_validate(test, !ast_utf8_is_valid("\xe2\x28\xa1"));
|
||||||
|
ast_test_validate(test, !ast_utf8_is_valid("\xe2\x82\x28"));
|
||||||
|
ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\xbc"));
|
||||||
|
ast_test_validate(test, !ast_utf8_is_valid("\xf0\x90\x28\xbc"));
|
||||||
|
ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\x28"));
|
||||||
|
|
||||||
|
return AST_TEST_PASS;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Test helper: copy 'src' with ast_utf8_copy_string() into a scratch buffer
 * of exactly 'dst_len' bytes and report whether the result equals 'cmp'.
 *
 * Returns non-zero when the copied string matches, 0 otherwise.
 */
static int test_copy_and_compare(const char *src, size_t dst_len, const char *cmp)
{
	char scratch[dst_len];

	ast_utf8_copy_string(scratch, src, dst_len);

	return !strcmp(scratch, cmp);
}
|
||||||
|
|
||||||
|
AST_TEST_DEFINE(test_utf8_copy_string)
|
||||||
|
{
|
||||||
|
switch (cmd) {
|
||||||
|
case TEST_INIT:
|
||||||
|
info->name = "copy_string";
|
||||||
|
info->category = "/main/utf8/";
|
||||||
|
info->summary = "Test ast_utf8_copy_string";
|
||||||
|
info->description =
|
||||||
|
"Tests UTF-8 string copying code.";
|
||||||
|
return AST_TEST_NOT_RUN;
|
||||||
|
case TEST_EXECUTE:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
ast_test_validate(test, test_copy_and_compare("Asterisk", 6, "Aster"));
|
||||||
|
ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 11, "Asterisk "));
|
||||||
|
ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 12, "Asterisk \xc2\xae"));
|
||||||
|
ast_test_validate(test, test_copy_and_compare("Asterisk \xc0\x8a", 12, "Asterisk "));
|
||||||
|
ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 1, ""));
|
||||||
|
ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 2, ""));
|
||||||
|
ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 3, "\xce\xbb"));
|
||||||
|
ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 4, "\xce\xbb "));
|
||||||
|
ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 5, "\xce\xbb x"));
|
||||||
|
ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 6, "\xce\xbb xy"));
|
||||||
|
ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 7, "\xce\xbb xyz"));
|
||||||
|
|
||||||
|
return AST_TEST_PASS;
|
||||||
|
}
|
||||||
|
|
||||||
|
AST_TEST_DEFINE(test_utf8_validator)
|
||||||
|
{
|
||||||
|
struct ast_utf8_validator *validator;
|
||||||
|
|
||||||
|
switch (cmd) {
|
||||||
|
case TEST_INIT:
|
||||||
|
info->name = "utf8_validator";
|
||||||
|
info->category = "/main/utf8/";
|
||||||
|
info->summary = "Test ast_utf8_validator";
|
||||||
|
info->description =
|
||||||
|
"Tests UTF-8 progressive validator code.";
|
||||||
|
return AST_TEST_NOT_RUN;
|
||||||
|
case TEST_EXECUTE:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ast_utf8_validator_new(&validator)) {
|
||||||
|
return AST_TEST_FAIL;
|
||||||
|
}
|
||||||
|
|
||||||
|
ast_test_validate(test, ast_utf8_validator_feed(validator, "Asterisk") == AST_UTF8_VALID);
|
||||||
|
ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc2") == AST_UTF8_UNKNOWN);
|
||||||
|
ast_test_validate(test, ast_utf8_validator_feed(validator, "\xae") == AST_UTF8_VALID);
|
||||||
|
ast_test_validate(test, ast_utf8_validator_feed(validator, "Private") == AST_UTF8_VALID);
|
||||||
|
ast_test_validate(test, ast_utf8_validator_feed(validator, "Branch") == AST_UTF8_VALID);
|
||||||
|
ast_test_validate(test, ast_utf8_validator_feed(validator, "Exchange") == AST_UTF8_VALID);
|
||||||
|
ast_test_validate(test, ast_utf8_validator_feed(validator, "\xe2") == AST_UTF8_UNKNOWN);
|
||||||
|
ast_test_validate(test, ast_utf8_validator_feed(validator, "\x84") == AST_UTF8_UNKNOWN);
|
||||||
|
ast_test_validate(test, ast_utf8_validator_feed(validator, "\xbb") == AST_UTF8_VALID);
|
||||||
|
ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc0\x8a") == AST_UTF8_INVALID);
|
||||||
|
ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
|
||||||
|
ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
|
||||||
|
ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
|
||||||
|
|
||||||
|
ast_utf8_validator_destroy(validator);
|
||||||
|
|
||||||
|
return AST_TEST_PASS;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void test_utf8_shutdown(void)
|
||||||
|
{
|
||||||
|
AST_TEST_UNREGISTER(test_utf8_is_valid);
|
||||||
|
AST_TEST_UNREGISTER(test_utf8_copy_string);
|
||||||
|
AST_TEST_UNREGISTER(test_utf8_validator);
|
||||||
|
}
|
||||||
|
|
||||||
|
int ast_utf8_init(void)
|
||||||
|
{
|
||||||
|
AST_TEST_REGISTER(test_utf8_is_valid);
|
||||||
|
AST_TEST_REGISTER(test_utf8_copy_string);
|
||||||
|
AST_TEST_REGISTER(test_utf8_validator);
|
||||||
|
|
||||||
|
ast_register_cleanup(test_utf8_shutdown);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else /* !TEST_FRAMEWORK */
|
||||||
|
|
||||||
|
/*
 * Build without TEST_FRAMEWORK: there are no tests to register, so this does
 * nothing.
 *
 * Always returns 0.
 */
int ast_utf8_init(void)
{
	return 0;
}
|
||||||
|
|
||||||
|
#endif
|
Reference in New Issue
Block a user