VLC  3.0.21
vlc_charset.h
Go to the documentation of this file.
1 /*****************************************************************************
2  * vlc_charset.h: Unicode UTF-8 wrappers function
3  *****************************************************************************
4  * Copyright (C) 2003-2005 VLC authors and VideoLAN
5  * Copyright © 2005-2010 Rémi Denis-Courmont
6  * $Id$
7  *
8  * Author: Rémi Denis-Courmont
9  *
10  * This program is free software; you can redistribute it and/or modify it
11  * under the terms of the GNU Lesser General Public License as published by
12  * the Free Software Foundation; either version 2.1 of the License, or
13  * (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  * GNU Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public License
21  * along with this program; if not, write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23  *****************************************************************************/
24 
25 #ifndef VLC_CHARSET_H
26 #define VLC_CHARSET_H 1
27 
28 /**
29  * \file
30  * Characters sets handling
31  *
32  * \ingroup strings
33  * @{
34  */
35 
36 /**
37  * Decodes a code point from UTF-8.
38  *
39  * Converts the first character in a UTF-8 sequence into a Unicode code point.
40  *
41  * \param str an UTF-8 bytes sequence [IN]
42  * \param pwc address of a location to store the code point [OUT]
43  *
44  * \return the number of bytes occupied by the decoded code point
45  *
46  * \retval (size_t)-1 not a valid UTF-8 sequence
47  * \retval 0 null character (i.e. str points to an empty string)
48  * \retval 1 (non-null) ASCII character
49  * \retval 2-4 non-ASCII character
50  */
51 VLC_API size_t vlc_towc(const char *str, uint32_t *restrict pwc);
52 
53 /**
54  * Checks UTF-8 validity.
55  *
56  * Checks whether a null-terminated string is a valid UTF-8 bytes sequence.
57  *
58  * \param str string to check
59  *
60  * \retval str the string is a valid null-terminated UTF-8 sequence
61  * \retval NULL the string is not an UTF-8 sequence
62  */
63 VLC_USED static inline const char *IsUTF8(const char *str)
64 {
65  size_t n;
66  uint32_t cp;
67 
68  while ((n = vlc_towc(str, &cp)) != 0)
69  if (likely(n != (size_t)-1))
70  str += n;
71  else
72  return NULL;
73  return str;
74 }
75 
76 /**
77  * Removes non-UTF-8 sequences.
78  *
79  * Replaces invalid or <i>over-long</i> UTF-8 bytes sequences within a
80  * null-terminated string with question marks. This is so that the string can
81  * be printed at least partially.
82  *
83  * \warning Do not use this were correctness is critical. use IsUTF8() and
84  * handle the error case instead. This function is mainly for display or debug.
85  *
86  * \note Converting from Latin-1 to UTF-8 in place is not possible (the string
87  * size would be increased). So it is not attempted even if it would otherwise
88  * be less disruptive.
89  *
90  * \retval str the string is a valid null-terminated UTF-8 sequence
91  * (i.e. no changes were made)
92  * \retval NULL the string is not an UTF-8 sequence
93  */
94 static inline char *EnsureUTF8(char *str)
95 {
96  char *ret = str;
97  size_t n;
98  uint32_t cp;
99 
100  while ((n = vlc_towc(str, &cp)) != 0)
101  if (likely(n != (size_t)-1))
102  str += n;
103  else
104  {
105  *str++ = '?';
106  ret = NULL;
107  }
108  return ret;
109 }
110 
111 /* iconv wrappers (defined in src/extras/libc.c) */
112 #define VLC_ICONV_ERR ((size_t) -1)
113 typedef void *vlc_iconv_t;
114 VLC_API vlc_iconv_t vlc_iconv_open( const char *, const char * ) VLC_USED;
115 VLC_API size_t vlc_iconv( vlc_iconv_t, const char **, size_t *, char **, size_t * ) VLC_USED;
117 
118 #include <stdarg.h>
119 
120 VLC_API int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap );
121 VLC_API int utf8_fprintf( FILE *, const char *, ... ) VLC_FORMAT( 2, 3 );
122 VLC_API char * vlc_strcasestr(const char *, const char *) VLC_USED;
123 
124 VLC_API char * FromCharset( const char *charset, const void *data, size_t data_size ) VLC_USED;
125 VLC_API void * ToCharset( const char *charset, const char *in, size_t *outsize ) VLC_USED;
126 
127 #ifdef __APPLE__
128 # include <CoreFoundation/CFString.h>
129 
130 /* Obtains a copy of the contents of a CFString in specified encoding.
131  * Returns char* (must be freed by caller) or NULL on failure.
132  */
133 VLC_USED static inline char *FromCFString(const CFStringRef cfString,
134  const CFStringEncoding cfStringEncoding)
135 {
136  // Try the quick way to obtain the buffer
137  const char *tmpBuffer = CFStringGetCStringPtr(cfString, cfStringEncoding);
138 
139  if (tmpBuffer != NULL) {
140  return strdup(tmpBuffer);
141  }
142 
143  // The quick way did not work, try the long way
144  CFIndex length = CFStringGetLength(cfString);
145  CFIndex maxSize =
146  CFStringGetMaximumSizeForEncoding(length, cfStringEncoding);
147 
148  // If result would exceed LONG_MAX, kCFNotFound is returned
149  if (unlikely(maxSize == kCFNotFound)) {
150  return NULL;
151  }
152 
153  // Account for the null terminator
154  maxSize++;
155 
156  char *buffer = (char *)malloc(maxSize);
157 
158  if (unlikely(buffer == NULL)) {
159  return NULL;
160  }
161 
162  // Copy CFString in requested encoding to buffer
163  Boolean success = CFStringGetCString(cfString, buffer, maxSize, cfStringEncoding);
164 
165  if (!success)
166  FREENULL(buffer);
167  return buffer;
168 }
169 #endif
170 
171 #ifdef _WIN32
172 VLC_USED
173 static inline char *FromWide (const wchar_t *wide)
174 {
175  size_t len = WideCharToMultiByte (CP_UTF8, 0, wide, -1, NULL, 0, NULL, NULL);
176  if (len == 0)
177  return NULL;
178 
179  char *out = (char *)malloc (len);
180 
181  if (likely(out))
182  WideCharToMultiByte (CP_UTF8, 0, wide, -1, out, len, NULL, NULL);
183  return out;
184 }
185 
186 VLC_USED
187 static inline wchar_t *ToWide (const char *utf8)
188 {
189  int len = MultiByteToWideChar (CP_UTF8, 0, utf8, -1, NULL, 0);
190  if (len == 0)
191  return NULL;
192 
193  wchar_t *out = (wchar_t *)malloc (len * sizeof (wchar_t));
194 
195  if (likely(out))
196  MultiByteToWideChar (CP_UTF8, 0, utf8, -1, out, len);
197  return out;
198 }
199 
201 static inline char *ToCodePage (unsigned cp, const char *utf8)
202 {
203  wchar_t *wide = ToWide (utf8);
204  if (wide == NULL)
205  return NULL;
206 
207  size_t len = WideCharToMultiByte (cp, 0, wide, -1, NULL, 0, NULL, NULL);
208  if (len == 0) {
209  free(wide);
210  return NULL;
211  }
212 
213  char *out = (char *)malloc (len);
214  if (likely(out != NULL))
215  WideCharToMultiByte (cp, 0, wide, -1, out, len, NULL, NULL);
216  free (wide);
217  return out;
218 }
219 
221 static inline char *FromCodePage (unsigned cp, const char *mb)
222 {
223  int len = MultiByteToWideChar (cp, 0, mb, -1, NULL, 0);
224  if (len == 0)
225  return NULL;
226 
227  wchar_t *wide = (wchar_t *)malloc (len * sizeof (wchar_t));
228  if (unlikely(wide == NULL))
229  return NULL;
230  MultiByteToWideChar (cp, 0, mb, -1, wide, len);
231 
232  char *utf8 = FromWide (wide);
233  free (wide);
234  return utf8;
235 }
236 
238 static inline char *FromANSI (const char *ansi)
239 {
240  return FromCodePage (GetACP (), ansi);
241 }
242 
244 static inline char *ToANSI (const char *utf8)
245 {
246  return ToCodePage (GetACP (), utf8);
247 }
248 
249 # ifdef UNICODE
250 # define FromT FromWide
251 # define ToT ToWide
252 # else
253 # define FromT FromANSI
254 # define ToT ToANSI
255 # endif
256 # define FromLocale FromANSI
257 # define ToLocale ToANSI
258 # define LocaleFree(s) free((char *)(s))
259 # define FromLocaleDup FromANSI
260 # define ToLocaleDup ToANSI
261 
262 #elif defined(__OS2__)
263 
264 VLC_USED static inline char *FromLocale (const char *locale)
265 {
266  return locale ? FromCharset ((char *)"", locale, strlen(locale)) : NULL;
267 }
268 
269 VLC_USED static inline char *ToLocale (const char *utf8)
270 {
271  size_t outsize;
272  return utf8 ? (char *)ToCharset ("", utf8, &outsize) : NULL;
273 }
274 
275 VLC_USED static inline void LocaleFree (const char *str)
276 {
277  free ((char *)str);
278 }
279 
280 VLC_USED static inline char *FromLocaleDup (const char *locale)
281 {
282  return FromCharset ("", locale, strlen(locale));
283 }
284 
285 VLC_USED static inline char *ToLocaleDup (const char *utf8)
286 {
287  size_t outsize;
288  return (char *)ToCharset ("", utf8, &outsize);
289 }
290 
291 #else
292 
293 # define FromLocale(l) (l)
294 # define ToLocale(u) (u)
295 # define LocaleFree(s) ((void)(s))
296 # define FromLocaleDup strdup
297 # define ToLocaleDup strdup
298 #endif
299 
300 /**
301  * Converts a nul-terminated string from ISO-8859-1 to UTF-8.
302  */
303 static inline char *FromLatin1 (const char *latin)
304 {
305  char *str = (char *)malloc (2 * strlen (latin) + 1), *utf8 = str;
306  unsigned char c;
307 
308  if (str == NULL)
309  return NULL;
310 
311  while ((c = *(latin++)) != '\0')
312  {
313  if (c >= 0x80)
314  {
315  *(utf8++) = 0xC0 | (c >> 6);
316  *(utf8++) = 0x80 | (c & 0x3F);
317  }
318  else
319  *(utf8++) = c;
320  }
321  *(utf8++) = '\0';
322 
323  utf8 = (char *)realloc (str, utf8 - str);
324  return utf8 ? utf8 : str;
325 }
326 
327 /** @} */
328 
329 VLC_API double us_strtod( const char *, char ** ) VLC_USED;
330 VLC_API float us_strtof( const char *, char ** ) VLC_USED;
331 VLC_API double us_atof( const char * ) VLC_USED;
332 VLC_API int us_vasprintf( char **, const char *, va_list );
333 VLC_API int us_asprintf( char **, const char *, ... ) VLC_USED;
334 
335 #endif
LocaleFree
#define LocaleFree(s)
Definition: vlc_charset.h:295
VLC_FORMAT
#define VLC_FORMAT(x, y)
Definition: vlc_common.h:100
VLC_API
#define VLC_API
Definition: fourcc_gen.c:30
vlc_iconv
size_t vlc_iconv(vlc_iconv_t, const char **, size_t *, char **, size_t *)
us_atof
double us_atof(const char *)
us_atof() has the same prototype as ANSI C atof() but it expects a dot as decimal separator,...
Definition: charset.c:87
vlc_common.h
FromLocaleDup
#define FromLocaleDup
Definition: vlc_charset.h:296
vlc_strcasestr
char * vlc_strcasestr(const char *, const char *)
Look for an UTF-8 string within another one in a case-insensitive fashion.
Definition: unicode.c:196
us_vasprintf
int us_vasprintf(char **, const char *, va_list)
us_vasprintf() has the same prototype as vasprintf(), but doesn't use the system locale.
Definition: charset.c:97
FromLocale
#define FromLocale(l)
Definition: vlc_charset.h:293
us_asprintf
int us_asprintf(char **, const char *,...)
us_asprintf() has the same prototype as asprintf(), but doesn't use the system locale.
Definition: charset.c:118
FromCharset
char * FromCharset(const char *charset, const void *data, size_t data_size)
Converts a string from the given character encoding to utf-8.
Definition: unicode.c:235
vlc_iconv_t
void * vlc_iconv_t
Definition: vlc_charset.h:113
FromLatin1
static char * FromLatin1(const char *latin)
Converts a nul-terminated string from ISO-8859-1 to UTF-8.
Definition: vlc_charset.h:303
us_strtof
float us_strtof(const char *, char **)
us_strtof() has the same prototype as ANSI C strtof() but it uses the POSIX/C decimal format,...
Definition: charset.c:68
ToCharset
void * ToCharset(const char *charset, const char *in, size_t *outsize)
Converts a nul-terminated UTF-8 string to a given character encoding.
Definition: unicode.c:277
ToLocaleDup
#define ToLocaleDup
Definition: vlc_charset.h:297
vlc_iconv_open
vlc_iconv_t vlc_iconv_open(const char *, const char *)
utf8_vfprintf
int utf8_vfprintf(FILE *stream, const char *fmt, va_list ap)
Formats an UTF-8 string as vfprintf(), then print it, with appropriate conversion to local encoding.
Definition: unicode.c:50
utf8_fprintf
int utf8_fprintf(FILE *, const char *,...)
Formats an UTF-8 string as fprintf(), then print it, with appropriate conversion to local encoding.
Definition: unicode.c:100
us_strtod
double us_strtod(const char *, char **)
us_strtod() has the same prototype as ANSI C strtod() but it uses the POSIX/C decimal format,...
Definition: charset.c:49
vlc_towc
size_t vlc_towc(const char *str, uint32_t *restrict pwc)
Decodes a code point from UTF-8.
Definition: unicode.c:111
likely
#define likely(p)
Definition: vlc_common.h:113
EnsureUTF8
static char * EnsureUTF8(char *str)
Removes non-UTF-8 sequences.
Definition: vlc_charset.h:94
FREENULL
#define FREENULL(a)
Definition: vlc_common.h:770
strdup
char * strdup(const char *)
VLC_USED
#define VLC_USED
Definition: fourcc_gen.c:31
vlc_iconv_close
int vlc_iconv_close(vlc_iconv_t)
VLC_MALLOC
#define VLC_MALLOC
Definition: vlc_common.h:102
unlikely
#define unlikely(p)
Definition: vlc_common.h:114
IsUTF8
static const char * IsUTF8(const char *str)
Checks UTF-8 validity.
Definition: vlc_charset.h:63
ToLocale
#define ToLocale(u)
Definition: vlc_charset.h:294