VLC 4.0.0-dev
Loading...
Searching...
No Matches
vlc_charset.h
Go to the documentation of this file.
1/*****************************************************************************
2 * vlc_charset.h: Unicode UTF-8 wrappers function
3 *****************************************************************************
4 * Copyright (C) 2003-2005 VLC authors and VideoLAN
5 * Copyright © 2005-2010 Rémi Denis-Courmont
6 *
7 * Author: Rémi Denis-Courmont
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU Lesser General Public License as published by
11 * the Free Software Foundation; either version 2.1 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public License
20 * along with this program; if not, write to the Free Software Foundation,
21 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22 *****************************************************************************/
23
24#ifndef VLC_CHARSET_H
25#define VLC_CHARSET_H 1
26
27/**
28 * \file vlc_charset.h
29 * \ingroup charset
30 * \defgroup charset Character sets
31 * \ingroup strings
32 * @{
33 */
34
35/**
36 * Decodes a code point from UTF-8.
37 *
38 * Converts the first character in a UTF-8 sequence into a Unicode code point.
39 *
40 * \param str a UTF-8 byte sequence [IN]
41 * \param pwc address of a location to store the code point [OUT]
42 *
43 * \return the number of bytes occupied by the decoded code point
44 *
45 * \retval -1 not a valid UTF-8 sequence
46 * \retval 0 null character (i.e. str points to an empty string)
47 * \retval 1 (non-null) ASCII character
48 * \retval 2-4 non-ASCII character
49 */
50VLC_API ssize_t vlc_towc(const char *str, uint32_t *restrict pwc);
51
52/**
53 * Checks UTF-8 validity.
54 *
55 * Checks whether a null-terminated string is a valid UTF-8 bytes sequence.
56 *
57 * \param str string to check
58 *
59 * \retval str the string is a valid null-terminated UTF-8 sequence
60 * \retval NULL the string is not an UTF-8 sequence
61 */
62VLC_USED static inline const char *IsUTF8(const char *str)
64 ssize_t n;
65 uint32_t cp;
66
67 while ((n = vlc_towc(str, &cp)) != 0)
68 if (likely(n != -1))
69 str += n;
70 else
71 return NULL;
72 return str;
73}
74
75/**
76 * Checks ASCII validity.
77 *
78 * Checks whether a null-terminated string is a valid ASCII bytes sequence
79 * (non-printable ASCII characters 1-31 are permitted).
80 *
81 * \param str string to check
82 *
83 * \retval str the string is a valid null-terminated ASCII sequence
84 * \retval NULL the string is not an ASCII sequence
85 */
86VLC_USED static inline const char *IsASCII(const char *str)
88 unsigned char c;
89
90 for (const char *p = str; (c = *p) != '\0'; p++)
91 if (c >= 0x80)
92 return NULL;
93 return str;
94}
95
96/**
97 * Removes non-UTF-8 sequences.
98 *
99 * Replaces invalid or <i>over-long</i> UTF-8 bytes sequences within a
100 * null-terminated string with question marks. This is so that the string can
101 * be printed at least partially.
102 *
103 * \warning Do not use this were correctness is critical. use IsUTF8() and
104 * handle the error case instead. This function is mainly for display or debug.
105 *
106 * \note Converting from Latin-1 to UTF-8 in place is not possible (the string
107 * size would be increased). So it is not attempted even if it would otherwise
108 * be less disruptive.
109 *
110 * \retval str the string is a valid null-terminated UTF-8 sequence
111 * (i.e. no changes were made)
112 * \retval NULL the string is not an UTF-8 sequence
113 */
114static inline char *EnsureUTF8(char *str)
116 char *ret = str;
117 ssize_t n;
118 uint32_t cp;
119
120 while ((n = vlc_towc(str, &cp)) != 0)
121 if (likely(n != -1))
122 str += n;
123 else
124 {
125 *str++ = '?';
126 ret = NULL;
127 }
128 return ret;
129}
130
131/**
132 * \defgroup iconv iconv wrappers
133 *
134 * (defined in src/extras/libc.c)
135 * @{
136 */
137
/* (size_t)-1. NOTE(review): presumably the error return value of vlc_iconv(),
 * mirroring iconv() — confirm against the implementation. */
138#define VLC_ICONV_ERR ((size_t) -1)
/* Opaque handle type used by the vlc_iconv wrappers. */
139typedef void *vlc_iconv_t;
/* NOTE(review): parameters are unnamed here; presumably (tocode, fromcode)
 * as with iconv_open() — confirm against the implementation. */
140VLC_API vlc_iconv_t vlc_iconv_open( const char *, const char * ) VLC_USED;
/* NOTE(review): presumably takes (handle, inbuf, inbytesleft, outbuf,
 * outbytesleft) as with iconv() — confirm against the implementation. */
141VLC_API size_t vlc_iconv( vlc_iconv_t, const char **, size_t *, char **, size_t * ) VLC_USED;
144/** @} */
145
146#include <stdarg.h>
147
/* Formats an UTF-8 string as vfprintf(), then prints it, with appropriate
 * conversion to local encoding. */
148VLC_API int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap );
/* Formats an UTF-8 string as fprintf(), then prints it, with appropriate
 * conversion to local encoding. */
149VLC_API int utf8_fprintf( FILE *, const char *, ... ) VLC_FORMAT( 2, 3 );
/* Looks for an UTF-8 string within another one in a case-insensitive
 * fashion. */
150VLC_API char * vlc_strcasestr(const char *, const char *) VLC_USED;
151
/* Converts a string from the given character encoding to UTF-8. */
152VLC_API char * FromCharset( const char *charset, const void *data, size_t data_size ) VLC_USED;
/* Converts a nul-terminated UTF-8 string to a given character encoding. */
153VLC_API void * ToCharset( const char *charset, const char *in, size_t *outsize ) VLC_USED;
154
155#ifdef __APPLE__
156# include <CoreFoundation/CoreFoundation.h>
157
158/* Obtains a copy of the contents of a CFString in specified encoding.
159 * Returns char* (must be freed by caller) or NULL on failure.
160 */
161VLC_USED static inline char *FromCFString(const CFStringRef cfString,
162 const CFStringEncoding cfStringEncoding)
163{
164 // Try the quick way to obtain the buffer
165 const char *tmpBuffer = CFStringGetCStringPtr(cfString, cfStringEncoding);
166
167 if (tmpBuffer != NULL) {
168 return strdup(tmpBuffer);
169 }
170
171 // The quick way did not work, try the long way
172 CFIndex length = CFStringGetLength(cfString);
173 CFIndex maxSize =
174 CFStringGetMaximumSizeForEncoding(length, cfStringEncoding);
175
176 // If result would exceed LONG_MAX, kCFNotFound is returned
177 if (unlikely(maxSize == kCFNotFound)) {
178 return NULL;
179 }
180
181 // Account for the null terminator
182 maxSize++;
183
184 char *buffer = (char *)malloc(maxSize);
185
186 if (unlikely(buffer == NULL)) {
187 return NULL;
188 }
189
190 // Copy CFString in requested encoding to buffer
191 Boolean success = CFStringGetCString(cfString, buffer, maxSize, cfStringEncoding);
192
193 if (!success)
194 FREENULL(buffer);
195 return buffer;
196}
197#endif
198
199#ifdef _WIN32
200# include <windows.h>
201
203static inline char *FromWide (const wchar_t *wide)
204{
205 size_t len = WideCharToMultiByte (CP_UTF8, 0, wide, -1, NULL, 0, NULL, NULL);
206 if (len == 0)
207 return NULL;
208
209 char *out = (char *)malloc (len);
210
211 if (likely(out))
212 WideCharToMultiByte (CP_UTF8, 0, wide, -1, out, len, NULL, NULL);
213 return out;
214}
215
217static inline wchar_t *ToWide (const char *utf8)
218{
219 int len = MultiByteToWideChar (CP_UTF8, 0, utf8, -1, NULL, 0);
220 if (len == 0)
221 return NULL;
222
223 wchar_t *out = (wchar_t *)malloc (len * sizeof (wchar_t));
224
225 if (likely(out))
226 MultiByteToWideChar (CP_UTF8, 0, utf8, -1, out, len);
227 return out;
228}
229
/**
 * Converts a nul-terminated UTF-8 string to the given code page, going
 * through an intermediate wide string obtained from ToWide().
 *
 * \param cp target code page identifier
 * \param utf8 nul-terminated UTF-8 string
 * \return a string allocated with malloc(), or NULL on failure
 */
static inline char *ToCodePage (unsigned cp, const char *utf8)
{
    wchar_t *wide = ToWide (utf8);
    if (wide == NULL)
        return NULL;

    char *out = NULL;

    /* Size query first; a zero size leaves the result as NULL. */
    size_t len = WideCharToMultiByte (cp, 0, wide, -1, NULL, 0, NULL, NULL);
    if (len != 0)
    {
        out = (char *)malloc (len);
        if (likely(out != NULL))
            WideCharToMultiByte (cp, 0, wide, -1, out, len, NULL, NULL);
    }

    /* The intermediate wide string is released on every path. */
    free (wide);
    return out;
}
249
/**
 * Converts a nul-terminated string from the given code page to UTF-8, going
 * through an intermediate wide string that is converted with FromWide().
 *
 * \param cp source code page identifier
 * \param mb nul-terminated multi-byte string in code page cp
 * \return a string allocated with malloc(), or NULL on failure
 */
static inline char *FromCodePage (unsigned cp, const char *mb)
{
    /* First call only computes the required number of wide characters. */
    int count = MultiByteToWideChar (cp, 0, mb, -1, NULL, 0);
    if (count == 0)
        return NULL;

    wchar_t *wide = (wchar_t *)malloc (count * sizeof (wchar_t));
    if (unlikely(wide == NULL))
        return NULL;

    MultiByteToWideChar (cp, 0, mb, -1, wide, count);

    /* The intermediate buffer is no longer needed once FromWide() returns. */
    char *result = FromWide (wide);
    free (wide);
    return result;
}
266
/**
 * Converts a nul-terminated string from the code page reported by GetACP()
 * to UTF-8.
 *
 * \param ansi nul-terminated string in the GetACP() code page
 * \return the result of FromCodePage() for that code page
 */
static inline char *FromANSI (const char *ansi)
{
    char *utf8 = FromCodePage (GetACP (), ansi);
    return utf8;
}
272
/**
 * Converts a nul-terminated UTF-8 string to the code page reported by
 * GetACP().
 *
 * \param utf8 nul-terminated UTF-8 string
 * \return the result of ToCodePage() for that code page
 */
static inline char *ToANSI (const char *utf8)
{
    char *ansi = ToCodePage (GetACP (), utf8);
    return ansi;
}
278
/* On Windows the locale helpers are aliases of the ANSI code page
 * converters defined above. Those return buffers obtained from malloc(),
 * so LocaleFree() releases its argument with free(), and the *Dup variants
 * are the same functions as the non-Dup ones. */
279# define FromLocale FromANSI
280# define ToLocale ToANSI
281# define LocaleFree(s) free((char *)(s))
282# define FromLocaleDup FromANSI
283# define ToLocaleDup ToANSI
284
285#elif defined(__OS2__)
286
287VLC_USED static inline char *FromLocale (const char *locale)
288{
289 return locale ? FromCharset ((char *)"", locale, strlen(locale)) : NULL;
290}
291
292VLC_USED static inline char *ToLocale (const char *utf8)
293{
294 size_t outsize;
295 return utf8 ? (char *)ToCharset ("", utf8, &outsize) : NULL;
296}
297
298VLC_USED static inline void LocaleFree (const char *str)
299{
300 free ((char *)str);
301}
302
303VLC_USED static inline char *FromLocaleDup (const char *locale)
304{
305 return FromCharset ("", locale, strlen(locale));
306}
307
308VLC_USED static inline char *ToLocaleDup (const char *utf8)
309{
310 size_t outsize;
311 return (char *)ToCharset ("", utf8, &outsize);
312}
313
314#else
315
/* On all other platforms no conversion is performed: FromLocale() and
 * ToLocale() expand to their argument unchanged, LocaleFree() only
 * evaluates its argument and frees nothing, and the *Dup variants are
 * plain strdup(). */
316# define FromLocale(l) (l)
317# define ToLocale(u) (u)
318# define LocaleFree(s) ((void)(s))
319# define FromLocaleDup strdup
320# define ToLocaleDup strdup
321#endif
322
/**
 * Converts a nul-terminated string from ISO-8859-1 to UTF-8.
 *
 * \param latin nul-terminated ISO-8859-1 string (must not be NULL)
 * \return a nul-terminated UTF-8 string allocated with malloc()
 *         (to be released with free()), or NULL if the allocation failed
 */
static inline char *FromLatin1 (const char *latin)
{
    /* Worst case: every input byte expands to two output bytes, plus the
     * terminator. */
    char *str = (char *)malloc (2 * strlen (latin) + 1), *utf8 = str;
    unsigned char c;

    if (str == NULL)
        return NULL;

    while ((c = (unsigned char)*(latin++)) != '\0')
    {
        if (c >= 0x80)
        {
            /* U+0080..U+00FF: two-byte sequence 110000xx 10xxxxxx */
            *(utf8++) = (char)(0xC0 | (c >> 6));
            *(utf8++) = (char)(0x80 | (c & 0x3F));
        }
        else
            *(utf8++) = (char)c;
    }
    *(utf8++) = '\0';

    /* Shrink the buffer to the size actually used; if that fails, the
     * original (larger) buffer is still valid and is returned instead. */
    utf8 = (char *)realloc (str, (size_t)(utf8 - str));
    return utf8 ? utf8 : str;
}
349
350/**
351 * \defgroup c_locale C/POSIX locale functions
352 * @{
353 */
354
355/**
356 * Parses a double in C locale.
357 *
358 * This function parses a double-precision floating point number from a string
359 * just like the standard strtod() but it uses the C locale. In other words, it
360 * expects the POSIX/C/American decimal format regardless of the current
361 * numeric locale.
362 *
363 * \param str nul-terminated string to parse
364 * \param[out] end storage space for a pointer to the first unparsed byte
365 * (or NULL to discard it)
366 * \return the parsed double value (zero if no character could be parsed)
367 */
368VLC_API double vlc_strtod_c(const char *restrict str, char **restrict end)
370
371/**
372 * Parses a float in C locale.
373 *
374 * This function parses a single-precision floating point number from a string
375 * just like the standard strtof() but it uses the C locale. In other words, it
376 * expects the POSIX/C/American decimal format regardless of the current
377 * numeric locale.
378 *
379 * \param str nul-terminated string to parse
380 * \param[out] end storage space for a pointer to the first unparsed byte
381 * (or NULL to discard it)
382 * \return the parsed double value (zero if no character could be parsed)
383 */
384VLC_API float vlc_strtof_c(const char *restrict str, char **restrict end)
386
387/**
388 * Parses a double in C locale.
389 *
390 * This function parses a double-precision floating point number from a string
391 * just like the standard atof() but it uses the C locale. In other words, it
392 * expects the POSIX/C/American decimal format regardless of the current
393 * numeric locale.
394 *
395 * \param str nul-terminated string to parse
396 * \return the parsed double value (zero if no character could be parsed)
397 */
398VLC_USED static inline double vlc_atof_c(const char *str)
400 return vlc_strtod_c(str, NULL);
401}
402
403/**
404 * Formats a string using the C locale.
405 *
406 * This function formats a string from a format string and a variable argument
407 * list, just like the standard vasprintf() but using the C locale for the
408 * formatting of numerals.
409 *
410 * \param[out] p storage space for a pointer to the heap-allocated formatted
411 * string (undefined on error)
412 * \param fmt format string
413 * \param ap variable argument list
414 * \return number of bytes formatted (excluding the nul terminator)
415 * or -1 on error
416 */
417VLC_API int vlc_vasprintf_c(char **restrict p, const char *restrict fmt,
418 va_list ap) VLC_USED;
419
420/**
421 * Formats a string using the C locale.
422 *
423 * This function formats a string from a format string and a variable argument
424 * list, just like the standard asprintf() but using the C locale for the
425 * formatting of numerals.
426 *
427 * \param[out] p storage space for a pointer to the heap-allocated formatted
428 * string (undefined on error)
429 * \param fmt format string
430 * \return number of bytes formatted (excluding the nul terminator)
431 * or -1 on error
432 */
433VLC_API int vlc_asprintf_c( char **p, const char *fmt, ... ) VLC_USED;
435/**
436 * Write a string to the output using the C locale
437 *
438 * This function formats a string from a format string and a variable argument
439 * list, just like the standard vfprintf() but using the C locale for the
440 * formatting of numerals.
441 *
442 * \param f output stream to write the string to
443 * \param fmt format string
444 * \param ap variable argument list
445 * \return number of bytes formatted (excluding the nul terminator)
446 * or -1 on error
447 */
448VLC_API int vlc_vfprintf_c(FILE *f, const char *fmt, va_list ap);
449
450/**
451 * Write a string to the output using the C locale
452 *
453 * This function formats a string from a format string and a variable argument
454 * list, just like the standard fprintf() but using the C locale for the
455 * formatting of numerals.
456 *
457 * \param f output stream to write the string to
458 * \param fmt format string
459 * \return number of bytes formatted (excluding the nul terminator)
460 * or -1 on error
461 */
462VLC_API int vlc_fprintf_c(FILE *f, const char *fmt, ...);
/* NOTE(review): presumably the C-locale counterparts of vsscanf() and
 * sscanf(), by analogy with the other functions of this group — confirm
 * against the implementation. Unlike their neighbours, these two are not
 * marked VLC_API. */
464int vlc_vsscanf_c(const char *, const char *, va_list) VLC_USED;
465int vlc_sscanf_c(const char*, const char*, ...) VLC_USED
/* With GCC-compatible compilers, argument 2 is checked as a scanf-style
 * format string against the variadic arguments starting at position 3. */
466#ifdef __GNUC__
467__attribute__((format(scanf, 2, 3)))
468#endif
469;
470
471/** @} */
472/** @} */
473
474#endif
#define VLC_USED
Definition fourcc_gen.c:32
#define VLC_API
Definition fourcc_gen.c:31
#define p(t)
double vlc_strtod_c(const char *restrict str, char **restrict end)
Parses a double in C locale.
Definition charset.c:46
static double vlc_atof_c(const char *str)
Parses a double in C locale.
Definition vlc_charset.h:399
int vlc_sscanf_c(const char *, const char *,...)
int vlc_asprintf_c(char **p, const char *fmt,...)
Formats a string using the C locale.
int vlc_fprintf_c(FILE *f, const char *fmt,...)
Write a string to the output using the C locale.
int vlc_vasprintf_c(char **restrict p, const char *restrict fmt, va_list ap)
Formats a string using the C locale.
Definition charset.c:74
float vlc_strtof_c(const char *restrict str, char **restrict end)
Parses a float in C locale.
Definition charset.c:60
int vlc_vfprintf_c(FILE *f, const char *fmt, va_list ap)
Write a string to the output using the C locale.
Definition charset.c:103
int vlc_vsscanf_c(const char *, const char *, va_list)
#define unlikely(p)
Predicted false condition.
Definition vlc_common.h:246
#define VLC_MALLOC
Definition vlc_common.h:157
#define likely(p)
Predicted true condition.
Definition vlc_common.h:237
#define VLC_FORMAT(x, y)
String format function annotation.
Definition vlc_common.h:193
char * vlc_strcasestr(const char *, const char *)
Look for an UTF-8 string within another one in a case-insensitive fashion.
Definition unicode.c:191
void * ToCharset(const char *charset, const char *in, size_t *outsize)
Converts a nul-terminated UTF-8 string to a given character encoding.
Definition unicode.c:274
static char * EnsureUTF8(char *str)
Removes non-UTF-8 sequences.
Definition vlc_charset.h:115
char * FromCharset(const char *charset, const void *data, size_t data_size)
Converts a string from the given character encoding to utf-8.
Definition unicode.c:232
static const char * IsUTF8(const char *str)
Checks UTF-8 validity.
Definition vlc_charset.h:63
#define ToLocaleDup
Definition vlc_charset.h:321
#define ToLocale(u)
Definition vlc_charset.h:318
#define LocaleFree(s)
Definition vlc_charset.h:319
ssize_t vlc_towc(const char *str, uint32_t *restrict pwc)
Decodes a code point from UTF-8.
Definition unicode.c:115
static char * FromLatin1(const char *latin)
Converts a nul-terminated string from ISO-8859-1 to UTF-8.
Definition vlc_charset.h:327
int utf8_vfprintf(FILE *stream, const char *fmt, va_list ap)
Formats an UTF-8 string as vfprintf(), then print it, with appropriate conversion to local encoding.
Definition unicode.c:52
#define FromLocaleDup
Definition vlc_charset.h:320
int utf8_fprintf(FILE *, const char *,...)
Formats an UTF-8 string as fprintf(), then print it, with appropriate conversion to local encoding.
Definition unicode.c:104
static const char * IsASCII(const char *str)
Checks ASCII validity.
Definition vlc_charset.h:87
#define FromLocale(l)
Definition vlc_charset.h:317
void * vlc_iconv_t
Definition vlc_charset.h:140
size_t vlc_iconv(vlc_iconv_t, const char **, size_t *, char **, size_t *)
int vlc_iconv_close(vlc_iconv_t)
vlc_iconv_t vlc_iconv_open(const char *, const char *)
This file is a collection of common definitions and types.
#define FREENULL(a)
Definition vlc_common.h:901
char * strdup(const char *)