VLC  4.0.0-dev
vlc_charset.h
Go to the documentation of this file.
1 /*****************************************************************************
2  * vlc_charset.h: Unicode UTF-8 wrappers function
3  *****************************************************************************
4  * Copyright (C) 2003-2005 VLC authors and VideoLAN
5  * Copyright © 2005-2010 Rémi Denis-Courmont
6  *
7  * Author: Rémi Denis-Courmont
8  *
9  * This program is free software; you can redistribute it and/or modify it
10  * under the terms of the GNU Lesser General Public License as published by
11  * the Free Software Foundation; either version 2.1 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public License
20  * along with this program; if not, write to the Free Software Foundation,
21  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22  *****************************************************************************/
23 
24 #ifndef VLC_CHARSET_H
25 #define VLC_CHARSET_H 1
26 
27 /**
28  * \file vlc_charset.h
29  * \ingroup charset
30  * \defgroup charset Character sets
31  * \ingroup strings
32  * @{
33  */
34 
35 /**
36  * Decodes a code point from UTF-8.
37  *
38  * Converts the first character in a UTF-8 sequence into a Unicode code point.
39  *
40  * \param str an UTF-8 bytes sequence [IN]
41  * \param pwc address of a location to store the code point [OUT]
42  *
43  * \return the number of bytes occupied by the decoded code point
44  *
45  * \retval (size_t)-1 not a valid UTF-8 sequence
46  * \retval 0 null character (i.e. str points to an empty string)
47  * \retval 1 (non-null) ASCII character
48  * \retval 2-4 non-ASCII character
49  */
50 VLC_API size_t vlc_towc(const char *str, uint32_t *restrict pwc);
51 
52 /**
53  * Checks UTF-8 validity.
54  *
55  * Checks whether a null-terminated string is a valid UTF-8 bytes sequence.
56  *
57  * \param str string to check
58  *
59  * \retval str the string is a valid null-terminated UTF-8 sequence
60  * \retval NULL the string is not an UTF-8 sequence
61  */
62 VLC_USED static inline const char *IsUTF8(const char *str)
63 {
64  size_t n;
65  uint32_t cp;
66 
67  while ((n = vlc_towc(str, &cp)) != 0)
68  if (likely(n != (size_t)-1))
69  str += n;
70  else
71  return NULL;
72  return str;
73 }
74 
/**
 * Checks ASCII validity.
 *
 * Scans a null-terminated string and reports whether every byte is a
 * 7-bit ASCII value (non-printable ASCII characters 1-31 are permitted).
 *
 * \param str string to check
 *
 * \retval str the string is a valid null-terminated ASCII sequence
 * \retval NULL the string is not an ASCII sequence
 */
VLC_USED static inline const char *IsASCII(const char *str)
{
    /* Inspect the bytes as unsigned so that values above 0x7F are not
     * mistaken for negative characters. */
    const unsigned char *cursor = (const unsigned char *)str;

    while (*cursor != '\0')
    {
        if (*cursor > 0x7F)
            return NULL;
        cursor++;
    }
    return str;
}
95 
/**
 * Removes non-UTF-8 sequences.
 *
 * Replaces invalid or <i>over-long</i> UTF-8 bytes sequences within a
 * null-terminated string with question marks. This is so that the string can
 * be printed at least partially. The string is modified in place.
 *
 * \warning Do not use this where correctness is critical. Use IsUTF8() and
 * handle the error case instead. This function is mainly for display or debug.
 *
 * \note Converting from Latin-1 to UTF-8 in place is not possible (the string
 * size would be increased). So it is not attempted even if it would otherwise
 * be less disruptive.
 *
 * \param str null-terminated string to sanitize in place
 *
 * \retval str the string is a valid null-terminated UTF-8 sequence
 * (i.e. no changes were made)
 * \retval NULL the string is not an UTF-8 sequence
 * (i.e. at least one byte was replaced)
 */
static inline char *EnsureUTF8(char *str)
{
    char *result = str; /* stays non-NULL only while no byte was replaced */
    char *cursor = str;
    uint32_t codepoint;

    for (;;)
    {
        const size_t len = vlc_towc(cursor, &codepoint);

        if (len == 0) /* reached the terminating null character */
            return result;

        if (likely(len != (size_t)-1))
        {
            cursor += len;
            continue;
        }

        /* Undecodable byte: overwrite it and resume right after it. */
        *cursor++ = '?';
        result = NULL;
    }
}
130 
131 /**
132  * \defgroup iconv iconv wrappers
133  *
134  * (defined in src/extras/libc.c)
135  * @{
136  */
137 
138 #define VLC_ICONV_ERR ((size_t) -1)
139 typedef void *vlc_iconv_t;
140 VLC_API vlc_iconv_t vlc_iconv_open( const char *, const char * ) VLC_USED;
141 VLC_API size_t vlc_iconv( vlc_iconv_t, const char **, size_t *, char **, size_t * ) VLC_USED;
143 
144 /** @} */
145 
146 #include <stdarg.h>
147 
148 VLC_API int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap );
149 VLC_API int utf8_fprintf( FILE *, const char *, ... ) VLC_FORMAT( 2, 3 );
150 VLC_API char * vlc_strcasestr(const char *, const char *) VLC_USED;
151 
152 VLC_API char * FromCharset( const char *charset, const void *data, size_t data_size ) VLC_USED;
153 VLC_API void * ToCharset( const char *charset, const char *in, size_t *outsize ) VLC_USED;
154 
155 #ifdef __APPLE__
156 # include <CoreFoundation/CoreFoundation.h>
157 
158 /* Obtains a copy of the contents of a CFString in specified encoding.
159  * Returns char* (must be freed by caller) or NULL on failure.
160  */
161 VLC_USED static inline char *FromCFString(const CFStringRef cfString,
162  const CFStringEncoding cfStringEncoding)
163 {
164  // Try the quick way to obtain the buffer
165  const char *tmpBuffer = CFStringGetCStringPtr(cfString, cfStringEncoding);
166 
167  if (tmpBuffer != NULL) {
168  return strdup(tmpBuffer);
169  }
170 
171  // The quick way did not work, try the long way
172  CFIndex length = CFStringGetLength(cfString);
173  CFIndex maxSize =
174  CFStringGetMaximumSizeForEncoding(length, cfStringEncoding);
175 
176  // If result would exceed LONG_MAX, kCFNotFound is returned
177  if (unlikely(maxSize == kCFNotFound)) {
178  return NULL;
179  }
180 
181  // Account for the null terminator
182  maxSize++;
183 
184  char *buffer = (char *)malloc(maxSize);
185 
186  if (unlikely(buffer == NULL)) {
187  return NULL;
188  }
189 
190  // Copy CFString in requested encoding to buffer
191  Boolean success = CFStringGetCString(cfString, buffer, maxSize, cfStringEncoding);
192 
193  if (!success)
194  FREENULL(buffer);
195  return buffer;
196 }
197 #endif
198 
199 #ifdef _WIN32
200 VLC_USED
201 static inline char *FromWide (const wchar_t *wide)
202 {
203  size_t len = WideCharToMultiByte (CP_UTF8, 0, wide, -1, NULL, 0, NULL, NULL);
204  if (len == 0)
205  return NULL;
206 
207  char *out = (char *)malloc (len);
208 
209  if (likely(out))
210  WideCharToMultiByte (CP_UTF8, 0, wide, -1, out, len, NULL, NULL);
211  return out;
212 }
213 
214 VLC_USED
215 static inline wchar_t *ToWide (const char *utf8)
216 {
217  int len = MultiByteToWideChar (CP_UTF8, 0, utf8, -1, NULL, 0);
218  if (len == 0)
219  return NULL;
220 
221  wchar_t *out = (wchar_t *)malloc (len * sizeof (wchar_t));
222 
223  if (likely(out))
224  MultiByteToWideChar (CP_UTF8, 0, utf8, -1, out, len);
225  return out;
226 }
227 
/**
 * Converts a null-terminated UTF-8 string to the given Windows code page.
 *
 * The conversion goes through an intermediate wide string (see ToWide()).
 *
 * \param cp target code page identifier
 * \param utf8 UTF-8 string to convert
 *
 * \return a newly allocated string (to be released with free()), or NULL if
 *         either conversion step or an allocation fails
 */
VLC_USED VLC_MALLOC
static inline char *ToCodePage (unsigned cp, const char *utf8)
{
    wchar_t *wide = ToWide (utf8);
    if (wide == NULL)
        return NULL;

    char *out = NULL;

    /* Size query first; the intermediate wide string is released on a single
     * path below whatever the outcome. */
    size_t len = WideCharToMultiByte (cp, 0, wide, -1, NULL, 0, NULL, NULL);
    if (len != 0)
    {
        out = (char *)malloc (len);
        if (likely(out != NULL))
            WideCharToMultiByte (cp, 0, wide, -1, out, len, NULL, NULL);
    }

    free (wide);
    return out;
}
247 
/**
 * Converts a null-terminated string from the given Windows code page to
 * UTF-8.
 *
 * The conversion goes through an intermediate wide string, which is then
 * handed to FromWide().
 *
 * \param cp source code page identifier
 * \param mb string encoded in code page \p cp
 *
 * \return a newly allocated UTF-8 string (to be released with free()), or
 *         NULL if the size query reports 0, an allocation fails, or
 *         FromWide() fails
 */
VLC_USED VLC_MALLOC
static inline char *FromCodePage (unsigned cp, const char *mb)
{
    /* Size query: number of wide characters to allocate. */
    int len = MultiByteToWideChar (cp, 0, mb, -1, NULL, 0);
    if (len == 0)
        return NULL;

    wchar_t *wide = (wchar_t *)malloc (len * sizeof (wchar_t));
    if (unlikely(wide == NULL))
        return NULL;
    MultiByteToWideChar (cp, 0, mb, -1, wide, len);

    /* The wide string is only a stepping stone towards UTF-8. */
    char *utf8 = FromWide (wide);
    free (wide);
    return utf8;
}
264 
/**
 * Converts a null-terminated string from the code page reported by GetACP()
 * to UTF-8.
 *
 * \return the result of FromCodePage(): a newly allocated string (to be
 *         released with free()), or NULL on failure
 */
VLC_USED VLC_MALLOC
static inline char *FromANSI (const char *ansi)
{
    return FromCodePage (GetACP (), ansi);
}
270 
/**
 * Converts a null-terminated UTF-8 string to the code page reported by
 * GetACP().
 *
 * \return the result of ToCodePage(): a newly allocated string (to be
 *         released with free()), or NULL on failure
 */
VLC_USED VLC_MALLOC
static inline char *ToANSI (const char *utf8)
{
    return ToCodePage (GetACP (), utf8);
}
276 
277 # define FromLocale FromANSI
278 # define ToLocale ToANSI
279 # define LocaleFree(s) free((char *)(s))
280 # define FromLocaleDup FromANSI
281 # define ToLocaleDup ToANSI
282 
283 #elif defined(__OS2__)
284 
285 VLC_USED static inline char *FromLocale (const char *locale)
286 {
287  return locale ? FromCharset ((char *)"", locale, strlen(locale)) : NULL;
288 }
289 
290 VLC_USED static inline char *ToLocale (const char *utf8)
291 {
292  size_t outsize;
293  return utf8 ? (char *)ToCharset ("", utf8, &outsize) : NULL;
294 }
295 
/**
 * Releases a string with free().
 *
 * Intended counterpart of FromLocale() / ToLocale() in this branch
 * (presumably their results are heap-allocated by FromCharset() /
 * ToCharset(); confirm against those functions).
 *
 * \param str string to release; the const qualifier is cast away for free()
 */
static inline void LocaleFree (const char *str)
{
    /* No VLC_USED here: the function returns nothing, so a "result must be
     * used" annotation would be meaningless. */
    free ((char *)str);
}
300 
301 VLC_USED static inline char *FromLocaleDup (const char *locale)
302 {
303  return FromCharset ("", locale, strlen(locale));
304 }
305 
306 VLC_USED static inline char *ToLocaleDup (const char *utf8)
307 {
308  size_t outsize;
309  return (char *)ToCharset ("", utf8, &outsize);
310 }
311 
312 #else
313 
314 # define FromLocale(l) (l)
315 # define ToLocale(u) (u)
316 # define LocaleFree(s) ((void)(s))
317 # define FromLocaleDup strdup
318 # define ToLocaleDup strdup
319 #endif
320 
/**
 * Converts a nul-terminated string from ISO-8859-1 to UTF-8.
 *
 * \param latin ISO-8859-1 string to convert
 *
 * \return a newly allocated UTF-8 string (to be released with free()),
 *         or NULL if the allocation fails
 */
static inline char *FromLatin1 (const char *latin)
{
    /* Every input byte yields at most two output bytes, plus the nul. */
    char *buffer = (char *)malloc (2 * strlen (latin) + 1);
    if (buffer == NULL)
        return NULL;

    char *out = buffer;

    for (const unsigned char *in = (const unsigned char *)latin;
         *in != '\0'; in++)
    {
        const unsigned char byte = *in;

        if (byte < 0x80)
        {   /* ASCII is copied verbatim */
            *(out++) = byte;
            continue;
        }

        /* Two-byte sequence: lead byte carries the top two bits,
         * continuation byte carries the low six. */
        *(out++) = 0xC0 | (byte >> 6);
        *(out++) = 0x80 | (byte & 0x3F);
    }
    *(out++) = '\0';

    /* Trim the allocation to the bytes actually written; if shrinking
     * fails, the original (larger) buffer is still valid and is returned. */
    char *trimmed = (char *)realloc (buffer, out - buffer);
    return (trimmed != NULL) ? trimmed : buffer;
}
347 
348 /**
349  * \defgroup c_locale C/POSIX locale functions
350  * @{
351  */
352 VLC_API double us_strtod( const char *, char ** ) VLC_USED;
353 VLC_API float us_strtof( const char *, char ** ) VLC_USED;
354 VLC_API double us_atof( const char * ) VLC_USED;
355 VLC_API int us_vasprintf( char **, const char *, va_list );
356 VLC_API int us_asprintf( char **, const char *, ... ) VLC_USED;
357 /** @} */
358 /** @} */
359 
360 #endif
vlc_iconv_t
void * vlc_iconv_t
Definition: vlc_charset.h:140
vlc_towc
VLC_EXPORT size_t vlc_towc(const char *str, uint32_t *restrict pwc)
Decodes a code point from UTF-8.
Definition: unicode.c:113
VLC_API
#define VLC_API
Definition: fourcc_gen.c:31
us_asprintf
VLC_EXPORT int us_asprintf(char **, const char *,...)
us_asprintf() has the same prototype as asprintf(), but doesn't use the system locale.
Definition: charset.c:119
us_strtod
VLC_EXPORT double us_strtod(const char *, char **)
us_strtod() has the same prototype as ANSI C strtod() but it uses the POSIX/C decimal format,...
Definition: charset.c:50
unlikely
#define unlikely(p)
Predicted false condition.
Definition: vlc_common.h:227
vlc_common.h
IsASCII
static const char * IsASCII(const char *str)
Checks ASCII validity.
Definition: vlc_charset.h:87
vlc_iconv_close
VLC_EXPORT int vlc_iconv_close(vlc_iconv_t)
FromLatin1
static char * FromLatin1(const char *latin)
Converts a nul-terminated string from ISO-8859-1 to UTF-8.
Definition: vlc_charset.h:325
VLC_MALLOC
#define VLC_MALLOC
Heap allocated result function annotation.
Definition: vlc_common.h:167
vlc_strcasestr
VLC_EXPORT char * vlc_strcasestr(const char *, const char *)
Look for an UTF-8 string within another one in a case-insensitive fashion.
Definition: unicode.c:198
FromLocaleDup
#define FromLocaleDup
Definition: vlc_charset.h:318
EnsureUTF8
static char * EnsureUTF8(char *str)
Removes non-UTF-8 sequences.
Definition: vlc_charset.h:115
LocaleFree
#define LocaleFree(s)
Definition: vlc_charset.h:317
vlc_iconv
VLC_EXPORT size_t vlc_iconv(vlc_iconv_t, const char **, size_t *, char **, size_t *)
FromLocale
#define FromLocale(l)
Definition: vlc_charset.h:315
us_atof
VLC_EXPORT double us_atof(const char *)
us_atof() has the same prototype as ANSI C atof() but it expects a dot as decimal separator,...
Definition: charset.c:88
us_vasprintf
VLC_EXPORT int us_vasprintf(char **, const char *, va_list)
us_vasprintf() has the same prototype as vasprintf(), but doesn't use the system locale.
Definition: charset.c:98
vlc_iconv_open
VLC_EXPORT vlc_iconv_t vlc_iconv_open(const char *, const char *)
FromCharset
VLC_EXPORT char * FromCharset(const char *charset, const void *data, size_t data_size)
Converts a string from the given character encoding to utf-8.
Definition: unicode.c:237
VLC_FORMAT
#define VLC_FORMAT(x, y)
String format function annotation.
Definition: vlc_common.h:141
ToLocaleDup
#define ToLocaleDup
Definition: vlc_charset.h:319
FREENULL
#define FREENULL(a)
Definition: vlc_common.h:961
strdup
char * strdup(const char *)
VLC_USED
#define VLC_USED
Definition: fourcc_gen.c:32
IsUTF8
static const char * IsUTF8(const char *str)
Checks UTF-8 validity.
Definition: vlc_charset.h:63
ToCharset
VLC_EXPORT void * ToCharset(const char *charset, const char *in, size_t *outsize)
Converts a nul-terminated UTF-8 string to a given character encoding.
Definition: unicode.c:279
ToLocale
#define ToLocale(u)
Definition: vlc_charset.h:316
utf8_vfprintf
VLC_EXPORT int utf8_vfprintf(FILE *stream, const char *fmt, va_list ap)
Formats an UTF-8 string as vfprintf(), then print it, with appropriate conversion to local encoding.
Definition: unicode.c:52
likely
#define likely(p)
Predicted true condition.
Definition: vlc_common.h:218
utf8_fprintf
VLC_EXPORT int utf8_fprintf(FILE *, const char *,...)
Formats an UTF-8 string as fprintf(), then print it, with appropriate conversion to local encoding.
Definition: unicode.c:102
us_strtof
VLC_EXPORT float us_strtof(const char *, char **)
us_strtof() has the same prototype as ANSI C strtof() but it uses the POSIX/C decimal format,...
Definition: charset.c:69
p
#define p(t)