res_pjsip: Replace invalid UTF-8 sequences in callerid name

* Added a new function ast_utf8_replace_invalid_chars() to utf8.c that copies a string, replacing any invalid UTF-8 sequences with the Unicode-specified U+FFFD replacement character. For example: "abc\xffdef" becomes "abc\uFFFDdef". Any UTF-8 compliant implementation will show that character as a � character.
* Updated res_pjsip:set_id_from_hdr() to use ast_utf8_replace_invalid_chars and print a warning if any invalid sequences were found during the copy.
* Updated stasis_channels:ast_channel_publish_varset to use ast_utf8_replace_invalid_chars and print a warning if any invalid sequences were found during the copy.

ASTERISK-27830
Change-Id: I4ffbdb19c80bf0efc675d40078a3ca4f85c567d8
2025-09-02 19:16:15 +00:00 · 2023-02-16 09:05:30 -07:00
parent e5c5cd6e25
commit ceda5a9859
4 changed files with 647 additions and 3 deletions
--- a/include/asterisk/utf8.h
+++ b/include/asterisk/utf8.h
@@ -67,6 +67,59 @@ int ast_utf8_is_validn(const char *str, size_t size);
 */
 void ast_utf8_copy_string(char *dst, const char *src, size_t size);

+enum ast_utf8_replace_result {
+	/*! \brief Source contained fully valid UTF-8
+	 *
+	 * The entire string was valid UTF-8 and no replacement
+	 * was required.
+	 */
+	AST_UTF8_REPLACE_VALID,
+
+	/*! \brief Source contained at least 1 invalid UTF-8 sequence
+	 *
+	 * Parts of the string contained invalid UTF-8 sequences
+	 * but those were successfully replaced with the U+FFFD
+	 * replacement character.
+	 */
+	AST_UTF8_REPLACE_INVALID,
+
+	/*! \brief Not enough space to copy entire source
+	 *
+	 * The destination buffer wasn't large enough to hold
+	 * all of the source characters.  As many source
+	 * characters as would fit were copied/replaced
+	 * and a terminating NUL was added.
+	 */
+	AST_UTF8_REPLACE_OVERRUN,
+};
+
+/*!
+ * \brief Copy a string safely replacing any invalid UTF-8 sequences
+ *
+ * This is similar to \ref ast_copy_string, but it will only copy valid UTF-8
+ * sequences from the source string into the destination buffer.
+ * If an invalid sequence is encountered, it's replaced with the U+FFFD
+ * sequence which is the valid UTF-8 sequence that represents an unknown,
+ * unrecognized, or unrepresentable character.  Since U+FFFD is actually a
+ * 3 byte sequence, the destination buffer will need to be larger than
+ * the corresponding source string if it contains invalid sequences.
+ * You can pass NULL as the destination buffer pointer to get the actual
+ * size required, then call the function again with the properly sized
+ * buffer.
+ *
+ * \param dst       Pointer to the destination buffer. If NULL,
+ *                  dst_size will be set to the size of the
+ *                  buffer required to fully process the
+ *                  source string.
+ * \param dst_size  Pointer to the size of the dst buffer (see dst)
+ * \param src       The source string
+ * \param src_len   The number of bytes of src to copy
+ *
+ * \return \ref ast_utf8_replace_result
+ */
+enum ast_utf8_replace_result ast_utf8_replace_invalid_chars(char *dst,
+	size_t *dst_size, const char *src, size_t src_len);
+
 enum ast_utf8_validation_result {
 	/*! \brief The consumed sequence is valid UTF-8
 	 *