res_pjsip: Replace invalid UTF-8 sequences in callerid name

* Added a new function ast_utf8_replace_invalid_chars() to
  utf8.c that copies a string, replacing any invalid UTF-8
  sequences with the Unicode-specified U+FFFD replacement
  character.  For example:  "abc\xffdef" becomes "abc\uFFFDdef".
  Any UTF-8 compliant implementation will show that character
  as a � character.

* Updated res_pjsip:set_id_from_hdr() to use
  ast_utf8_replace_invalid_chars and print a warning if any
  invalid sequences were found during the copy.

* Updated stasis_channels:ast_channel_publish_varset to use
  ast_utf8_replace_invalid_chars and print a warning if any
  invalid sequences were found during the copy.

ASTERISK-27830

Change-Id: I4ffbdb19c80bf0efc675d40078a3ca4f85c567d8
This commit is contained in:
George Joseph
2023-02-16 09:05:30 -07:00
committed by George Joseph
parent e5c5cd6e25
commit ceda5a9859
4 changed files with 647 additions and 3 deletions

View File

@@ -67,6 +67,59 @@ int ast_utf8_is_validn(const char *str, size_t size);
*/
void ast_utf8_copy_string(char *dst, const char *src, size_t size);
/*!
* \brief Result codes returned by ast_utf8_replace_invalid_chars()
*
* Each value describes what happened while the source string was
* copied into the destination buffer.
*/
enum ast_utf8_replace_result {
/*! \brief Source contained fully valid UTF-8
*
* The entire string was valid UTF-8 and no replacement
* was required.
*/
AST_UTF8_REPLACE_VALID,
/*! \brief Source contained at least 1 invalid UTF-8 sequence
*
* Parts of the string contained invalid UTF-8 sequences
* but those were successfully replaced with the U+FFFD
* replacement sequence.
*/
AST_UTF8_REPLACE_INVALID,
/*! \brief Not enough space to copy entire source
*
* The destination buffer wasn't large enough to hold
* all of the source characters. As many source characters
* as would fit were copied/replaced and a final NUL
* terminator was added, so the destination holds a
* truncated result.
*/
AST_UTF8_REPLACE_OVERRUN,
};
/*!
* \brief Copy a string safely replacing any invalid UTF-8 sequences
*
* This is similar to \ref ast_copy_string, but it will only copy valid UTF-8
* sequences from the source string into the destination buffer.
* If an invalid sequence is encountered, it's replaced with the U+FFFD
* REPLACEMENT CHARACTER, which is the valid UTF-8 sequence that represents
* an unknown, unrecognized, or unrepresentable character. Since U+FFFD
* encodes to a 3 byte sequence in UTF-8, the destination buffer will need
* to be larger than the corresponding source string if it contains
* invalid sequences.
*
* You can pass NULL as the destination buffer pointer to get the actual
* size required, then call the function again with a properly sized
* buffer.
*
* \param dst Pointer to the destination buffer. If NULL,
* nothing is copied and *dst_size will be set to
* the size of the buffer required to fully process
* the source string.
* \param dst_size A pointer to the size, in bytes, of the dst buffer.
* Written by the function when dst is NULL (see above).
* NOTE(review): whether it is also updated when dst
* is non-NULL, and whether the reported size counts
* the terminating NUL, is not visible from this
* header - confirm against the implementation.
* \param src The source string
* \param src_len The number of bytes of src to process
*
* \return An \ref ast_utf8_replace_result value:
* AST_UTF8_REPLACE_VALID if no replacement was required,
* AST_UTF8_REPLACE_INVALID if at least one invalid sequence
* was replaced, or AST_UTF8_REPLACE_OVERRUN if the destination
* buffer was too small to hold the entire result.
*/
enum ast_utf8_replace_result ast_utf8_replace_invalid_chars(char *dst,
size_t *dst_size, const char *src, size_t src_len);
enum ast_utf8_validation_result {
/*! \brief The consumed sequence is valid UTF-8
*