1 /* Determine a canonical name for the current locale's character encoding. 2 3 Copyright (C) 2000-2006 Free Software Foundation, Inc. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License along 16 with this program; if not, write to the Free Software Foundation, 17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 18 19 /* Written by Bruno Haible <bruno@clisp.org>. */ 20 21 #include <config.h> 22 23 /* Specification. */ 24 #include "localcharset.h" 25 26 #include <stddef.h> 27 #include <stdio.h> 28 #include <string.h> 29 #include <stdlib.h> 30 31 #if defined _WIN32 || defined __WIN32__ 32 # define WIN32_NATIVE 33 #endif 34 35 #if defined __EMX__ 36 /* Assume EMX program runs on OS/2, even if compiled under DOS. */ 37 # define OS2 38 #endif 39 40 #if !defined WIN32_NATIVE 41 # if HAVE_LANGINFO_CODESET 42 # include <langinfo.h> 43 # else 44 # if 0 /* see comment below */ 45 # include <locale.h> 46 # endif 47 # endif 48 # ifdef __CYGWIN__ 49 # define WIN32_LEAN_AND_MEAN 50 # include <windows.h> 51 # endif 52 #elif defined WIN32_NATIVE 53 # define WIN32_LEAN_AND_MEAN 54 # include <windows.h> 55 #endif 56 #if defined OS2 57 # define INCL_DOS 58 # include <os2.h> 59 #endif 60 61 #if ENABLE_RELOCATABLE 62 # include "relocatable.h" 63 #else 64 # define relocate(pathname) (pathname) 65 #endif 66 67 /* Get LIBDIR. */ 68 #ifndef LIBDIR 69 # include "configmake.h" 70 #endif 71 72 #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__ 73 /* Win32, Cygwin, OS/2, DOS */ 74 # define ISSLASH(C) ((C) == '/' || (C) == '\\') 75 #endif 76 77 #ifndef DIRECTORY_SEPARATOR 78 # define DIRECTORY_SEPARATOR '/' 79 #endif 80 81 #ifndef ISSLASH 82 # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR) 83 #endif 84 85 #if HAVE_DECL_GETC_UNLOCKED 86 # undef getc 87 # define getc getc_unlocked 88 #endif 89 90 /* The following static variable is declared 'volatile' to avoid a 91 possible multithread problem in the function get_charset_aliases. If we 92 are running in a threaded environment, and if two threads initialize 93 'charset_aliases' simultaneously, both will produce the same value, 94 and everything will be ok if the two assignments to 'charset_aliases' 95 are atomic. But I don't know what will happen if the two assignments mix. */ 96 #if __STDC__ != 1 97 # define volatile /* empty */ 98 #endif 99 /* Pointer to the contents of the charset.alias file, if it has already been 100 read, else NULL. Its format is: 101 ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */ 102 static const char * volatile charset_aliases; 103 104 /* Return a pointer to the contents of the charset.alias file. */ 105 static const char * 106 get_charset_aliases (void) 107 { 108 const char *cp; 109 110 cp = charset_aliases; 111 if (cp == NULL) 112 { 113 #if !(defined VMS || defined WIN32_NATIVE || defined __CYGWIN__) 114 FILE *fp; 115 const char *dir; 116 const char *base = "charset.alias"; 117 char *file_name; 118 119 /* Make it possible to override the charset.alias location. This is 120 necessary for running the testsuite before "make install". */ 121 dir = getenv ("CHARSETALIASDIR"); 122 if (dir == NULL || dir[0] == '\0') 123 dir = relocate (LIBDIR); 124 125 /* Concatenate dir and base into freshly allocated file_name. */ 126 { 127 size_t dir_len = strlen (dir); 128 size_t base_len = strlen (base); 129 int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1])); 130 file_name = (char *) malloc (dir_len + add_slash + base_len + 1); 131 if (file_name != NULL) 132 { 133 memcpy (file_name, dir, dir_len); 134 if (add_slash) 135 file_name[dir_len] = DIRECTORY_SEPARATOR; 136 memcpy (file_name + dir_len + add_slash, base, base_len + 1); 137 } 138 } 139 140 if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL) 141 /* Out of memory or file not found, treat it as empty. */ 142 cp = ""; 143 else 144 { 145 /* Parse the file's contents. */ 146 char *res_ptr = NULL; 147 size_t res_size = 0; 148 149 for (;;) 150 { 151 int c; 152 char buf1[50+1]; 153 char buf2[50+1]; 154 size_t l1, l2; 155 char *old_res_ptr; 156 157 c = getc (fp); 158 if (c == EOF) 159 break; 160 if (c == '\n' || c == ' ' || c == '\t') 161 continue; 162 if (c == '#') 163 { 164 /* Skip comment, to end of line. */ 165 do 166 c = getc (fp); 167 while (!(c == EOF || c == '\n')); 168 if (c == EOF) 169 break; 170 continue; 171 } 172 ungetc (c, fp); 173 if (fscanf (fp, "%50s %50s", buf1, buf2) < 2) 174 break; 175 l1 = strlen (buf1); 176 l2 = strlen (buf2); 177 old_res_ptr = res_ptr; 178 if (res_size == 0) 179 { 180 res_size = l1 + 1 + l2 + 1; 181 res_ptr = (char *) malloc (res_size + 1); 182 } 183 else 184 { 185 res_size += l1 + 1 + l2 + 1; 186 res_ptr = (char *) realloc (res_ptr, res_size + 1); 187 } 188 if (res_ptr == NULL) 189 { 190 /* Out of memory. */ 191 res_size = 0; 192 if (old_res_ptr != NULL) 193 free (old_res_ptr); 194 break; 195 } 196 strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1); 197 strcpy (res_ptr + res_size - (l2 + 1), buf2); 198 } 199 fclose (fp); 200 if (res_size == 0) 201 cp = ""; 202 else 203 { 204 *(res_ptr + res_size) = '\0'; 205 cp = res_ptr; 206 } 207 } 208 209 if (file_name != NULL) 210 free (file_name); 211 212 #else 213 214 # if defined VMS 215 /* To avoid the troubles of an extra file charset.alias_vms in the 216 sources of many GNU packages, simply inline the aliases here. */ 217 /* The list of encodings is taken from the OpenVMS 7.3-1 documentation 218 "Compaq C Run-Time Library Reference Manual for OpenVMS systems" 219 section 10.7 "Handling Different Character Sets". */ 220 cp = "ISO8859-1" "\0" "ISO-8859-1" "\0" 221 "ISO8859-2" "\0" "ISO-8859-2" "\0" 222 "ISO8859-5" "\0" "ISO-8859-5" "\0" 223 "ISO8859-7" "\0" "ISO-8859-7" "\0" 224 "ISO8859-8" "\0" "ISO-8859-8" "\0" 225 "ISO8859-9" "\0" "ISO-8859-9" "\0" 226 /* Japanese */ 227 "eucJP" "\0" "EUC-JP" "\0" 228 "SJIS" "\0" "SHIFT_JIS" "\0" 229 "DECKANJI" "\0" "DEC-KANJI" "\0" 230 "SDECKANJI" "\0" "EUC-JP" "\0" 231 /* Chinese */ 232 "eucTW" "\0" "EUC-TW" "\0" 233 "DECHANYU" "\0" "DEC-HANYU" "\0" 234 "DECHANZI" "\0" "GB2312" "\0" 235 /* Korean */ 236 "DECKOREAN" "\0" "EUC-KR" "\0"; 237 # endif 238 239 # if defined WIN32_NATIVE || defined __CYGWIN__ 240 /* To avoid the troubles of installing a separate file in the same 241 directory as the DLL and of retrieving the DLL's directory at 242 runtime, simply inline the aliases here. */ 243 244 cp = "CP936" "\0" "GBK" "\0" 245 "CP1361" "\0" "JOHAB" "\0" 246 "CP20127" "\0" "ASCII" "\0" 247 "CP20866" "\0" "KOI8-R" "\0" 248 "CP20936" "\0" "GB2312" "\0" 249 "CP21866" "\0" "KOI8-RU" "\0" 250 "CP28591" "\0" "ISO-8859-1" "\0" 251 "CP28592" "\0" "ISO-8859-2" "\0" 252 "CP28593" "\0" "ISO-8859-3" "\0" 253 "CP28594" "\0" "ISO-8859-4" "\0" 254 "CP28595" "\0" "ISO-8859-5" "\0" 255 "CP28596" "\0" "ISO-8859-6" "\0" 256 "CP28597" "\0" "ISO-8859-7" "\0" 257 "CP28598" "\0" "ISO-8859-8" "\0" 258 "CP28599" "\0" "ISO-8859-9" "\0" 259 "CP28605" "\0" "ISO-8859-15" "\0" 260 "CP38598" "\0" "ISO-8859-8" "\0" 261 "CP51932" "\0" "EUC-JP" "\0" 262 "CP51936" "\0" "GB2312" "\0" 263 "CP51949" "\0" "EUC-KR" "\0" 264 "CP51950" "\0" "EUC-TW" "\0" 265 "CP54936" "\0" "GB18030" "\0" 266 "CP65001" "\0" "UTF-8" "\0"; 267 # endif 268 #endif 269 270 charset_aliases = cp; 271 } 272 273 return cp; 274 } 275 276 /* Determine the current locale's character encoding, and canonicalize it 277 into one of the canonical names listed in config.charset. 278 The result must not be freed; it is statically allocated. 279 If the canonical name cannot be determined, the result is a non-canonical 280 name. */ 281 282 #ifdef STATIC 283 STATIC 284 #endif 285 const char * 286 locale_charset (void) 287 { 288 const char *codeset; 289 const char *aliases; 290 291 #if !(defined WIN32_NATIVE || defined OS2) 292 293 # if HAVE_LANGINFO_CODESET 294 295 /* Most systems support nl_langinfo (CODESET) nowadays. */ 296 codeset = nl_langinfo (CODESET); 297 298 # ifdef __CYGWIN__ 299 /* Cygwin 2006 does not have locales. nl_langinfo (CODESET) always 300 returns "US-ASCII". As long as this is not fixed, return the suffix 301 of the locale name from the environment variables (if present) or 302 the codepage as a number. */ 303 if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0) 304 { 305 const char *locale; 306 static char buf[2 + 10 + 1]; 307 308 locale = getenv ("LC_ALL"); 309 if (locale == NULL || locale[0] == '\0') 310 { 311 locale = getenv ("LC_CTYPE"); 312 if (locale == NULL || locale[0] == '\0') 313 locale = getenv ("LANG"); 314 } 315 if (locale != NULL && locale[0] != '\0') 316 { 317 /* If the locale name contains an encoding after the dot, return 318 it. */ 319 const char *dot = strchr (locale, '.'); 320 321 if (dot != NULL) 322 { 323 const char *modifier; 324 325 dot++; 326 /* Look for the possible @... trailer and remove it, if any. */ 327 modifier = strchr (dot, '@'); 328 if (modifier == NULL) 329 return dot; 330 if (modifier - dot < sizeof (buf)) 331 { 332 memcpy (buf, dot, modifier - dot); 333 buf [modifier - dot] = '\0'; 334 return buf; 335 } 336 } 337 } 338 339 /* Woe32 has a function returning the locale's codepage as a number. */ 340 sprintf (buf, "CP%u", GetACP ()); 341 codeset = buf; 342 } 343 # endif 344 345 # else 346 347 /* On old systems which lack it, use setlocale or getenv. */ 348 const char *locale = NULL; 349 350 /* But most old systems don't have a complete set of locales. Some 351 (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't 352 use setlocale here; it would return "C" when it doesn't support the 353 locale name the user has set. */ 354 # if 0 355 locale = setlocale (LC_CTYPE, NULL); 356 # endif 357 if (locale == NULL || locale[0] == '\0') 358 { 359 locale = getenv ("LC_ALL"); 360 if (locale == NULL || locale[0] == '\0') 361 { 362 locale = getenv ("LC_CTYPE"); 363 if (locale == NULL || locale[0] == '\0') 364 locale = getenv ("LANG"); 365 } 366 } 367 368 /* On some old systems, one used to set locale = "iso8859_1". On others, 369 you set it to "language_COUNTRY.charset". In any case, we resolve it 370 through the charset.alias file. */ 371 codeset = locale; 372 373 # endif 374 375 #elif defined WIN32_NATIVE 376 377 static char buf[2 + 10 + 1]; 378 379 /* Woe32 has a function returning the locale's codepage as a number. */ 380 sprintf (buf, "CP%u", GetACP ()); 381 codeset = buf; 382 383 #elif defined OS2 384 385 const char *locale; 386 static char buf[2 + 10 + 1]; 387 ULONG cp[3]; 388 ULONG cplen; 389 390 /* Allow user to override the codeset, as set in the operating system, 391 with standard language environment variables. */ 392 locale = getenv ("LC_ALL"); 393 if (locale == NULL || locale[0] == '\0') 394 { 395 locale = getenv ("LC_CTYPE"); 396 if (locale == NULL || locale[0] == '\0') 397 locale = getenv ("LANG"); 398 } 399 if (locale != NULL && locale[0] != '\0') 400 { 401 /* If the locale name contains an encoding after the dot, return it. */ 402 const char *dot = strchr (locale, '.'); 403 404 if (dot != NULL) 405 { 406 const char *modifier; 407 408 dot++; 409 /* Look for the possible @... trailer and remove it, if any. */ 410 modifier = strchr (dot, '@'); 411 if (modifier == NULL) 412 return dot; 413 if (modifier - dot < sizeof (buf)) 414 { 415 memcpy (buf, dot, modifier - dot); 416 buf [modifier - dot] = '\0'; 417 return buf; 418 } 419 } 420 421 /* Resolve through the charset.alias file. */ 422 codeset = locale; 423 } 424 else 425 { 426 /* OS/2 has a function returning the locale's codepage as a number. */ 427 if (DosQueryCp (sizeof (cp), cp, &cplen)) 428 codeset = ""; 429 else 430 { 431 sprintf (buf, "CP%u", cp[0]); 432 codeset = buf; 433 } 434 } 435 436 #endif 437 438 if (codeset == NULL) 439 /* The canonical name cannot be determined. */ 440 codeset = ""; 441 442 /* Resolve alias. */ 443 for (aliases = get_charset_aliases (); 444 *aliases != '\0'; 445 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1) 446 if (strcmp (codeset, aliases) == 0 447 || (aliases[0] == '*' && aliases[1] == '\0')) 448 { 449 codeset = aliases + strlen (aliases) + 1; 450 break; 451 } 452 453 /* Don't return an empty string. GNU libc and GNU libiconv interpret 454 the empty string as denoting "the locale's character encoding", 455 thus GNU libiconv would call this function a second time. */ 456 if (codeset[0] == '\0') 457 codeset = "ASCII"; 458 459 return codeset; 460 }