Conversion between different character encodings. More...
#include "config.h"
#include <errno.h>
#include <iconv.h>
#include <langinfo.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include "charset.h"
#include "buffer.h"
#include "list.h"
#include "logging2.h"
#include "memory.h"
#include "pool.h"
#include "queue.h"
#include "regex3.h"
#include "slist.h"
#include "string2.h"
#include <libintl.h>
Go to the source code of this file.
Data Structures | |
struct | Lookup |
Regex to String lookup table. More... | |
struct | IconvCacheEntry |
Cached iconv conversion descriptor. More... | |
struct | MimeNames |
MIME name lookup entry. More... | |
Macros | |
#define | EILSEQ EINVAL |
#define | ICONV_CACHE_SIZE 16 |
Max size of the iconv cache. | |
Functions | |
TAILQ_HEAD (LookupList, Lookup) | |
static struct Lookup * | lookup_new (void) |
Create a new Lookup. | |
static void | lookup_free (struct Lookup **ptr) |
Free a Lookup. | |
static const char * | lookup_charset (enum LookupType type, const char *cs) |
Look for a preferred character set name. | |
int | mutt_ch_convert_nonmime_string (const struct Slist *const assumed_charset, const char *charset, char **ps) |
Try to convert a string using a list of character sets. | |
void | mutt_ch_canonical_charset (char *buf, size_t buflen, const char *name) |
Canonicalise the charset of a string. | |
bool | mutt_ch_chscmp (const char *cs1, const char *cs2) |
Are the names of two character sets equivalent? | |
const char * | mutt_ch_get_default_charset (const struct Slist *const assumed_charset) |
Get the default character set. | |
char * | mutt_ch_get_langinfo_charset (void) |
Get the user's choice of character set. | |
bool | mutt_ch_lookup_add (enum LookupType type, const char *pat, const char *replace, struct Buffer *err) |
Add a new character set lookup. | |
void | mutt_ch_lookup_remove (void) |
Remove all the character set lookups. | |
const char * | mutt_ch_charset_lookup (const char *chs) |
Look for a replacement character set. | |
iconv_t | mutt_ch_iconv_open (const char *tocode, const char *fromcode, uint8_t flags) |
Set up iconv for conversions. | |
size_t | mutt_ch_iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno) |
Change the encoding of a string. | |
const char * | mutt_ch_iconv_lookup (const char *chs) |
Look for a replacement character set. | |
int | mutt_ch_check (const char *s, size_t slen, const char *from, const char *to) |
Check whether a string can be converted between encodings. | |
int | mutt_ch_convert_string (char **ps, const char *from, const char *to, uint8_t flags) |
Convert a string between encodings. | |
bool | mutt_ch_check_charset (const char *cs, bool strict) |
Does iconv understand a character set? | |
struct FgetConv * | mutt_ch_fgetconv_open (FILE *fp, const char *from, const char *to, uint8_t flags) |
Prepare a file for charset conversion. | |
void | mutt_ch_fgetconv_close (struct FgetConv **ptr) |
Close an fgetconv handle. | |
int | mutt_ch_fgetconv (struct FgetConv *fc) |
Convert a file's character set. | |
char * | mutt_ch_fgetconvs (char *buf, size_t buflen, struct FgetConv *fc) |
Convert a file's charset into a string buffer. | |
void | mutt_ch_set_charset (const char *charset) |
Update the records for a new character set. | |
char * | mutt_ch_choose (const char *fromcode, const struct Slist *charsets, const char *u, size_t ulen, char **d, size_t *dlen) |
Figure the best charset to encode a string. | |
void | mutt_ch_cache_cleanup (void) |
Clean up the cached iconv handles and charset strings. | |
Variables | |
wchar_t | ReplacementChar = '?' |
When a Unicode character can't be displayed, use this instead. | |
bool | CharsetIsUtf8 = false |
Is the user's current character set utf-8? | |
static struct LookupList | Lookups = TAILQ_HEAD_INITIALIZER(Lookups) |
Lookup table of preferred character set names. | |
static struct IconvCacheEntry | IconvCache [ICONV_CACHE_SIZE] |
Cache of iconv conversion descriptors. | |
static int | IconvCacheUsed = 0 |
Number of iconv descriptors in the cache. | |
static const struct MimeNames | PreferredMimeNames [] |
Lookup table of preferred charsets. | |
Conversion between different character encodings.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.
Definition in file charset.c.
TAILQ_HEAD | ( | LookupList | , |
Lookup | |||
) |
|
static |
Create a new Lookup.
ptr | New Lookup |
Definition at line 269 of file charset.c.
|
static |
|
static |
Look for a preferred character set name.
type | Type, e.g. MUTT_LOOKUP_CHARSET |
cs | Character set |
ptr | Charset string |
If the character set matches one of the regexes, then return the replacement name.
Definition at line 303 of file charset.c.
int mutt_ch_convert_nonmime_string | ( | const struct Slist *const | assumed_charset, |
const char * | charset, | ||
char ** | ps | ||
) |
Try to convert a string using a list of character sets.
[in] | assumed_charset | From $assumed_charset |
[in] | charset | From $charset |
[in,out] | ps | String to be converted |
0 | Success |
-1 | Error |
Work through $assumed_charset
looking for a character set conversion that works. Failing that, try mutt_ch_get_default_charset().
Definition at line 331 of file charset.c.
void mutt_ch_canonical_charset | ( | char * | buf, |
size_t | buflen, | ||
const char * | name | ||
) |
Canonicalise the charset of a string.
buf | Buffer for canonical character set name |
buflen | Length of buffer |
name | Name to be canonicalised |
This first ties off any charset extension such as "//TRANSLIT", canonicalizes the charset, and then re-adds the extension.
Definition at line 374 of file charset.c.
bool mutt_ch_chscmp | ( | const char * | cs1, |
const char * | cs2 | ||
) |
Are the names of two character sets equivalent?
cs1 | First character set |
cs2 | Second character set |
true | Names are equivalent |
false | Names differ |
Charsets may have extensions that mutt_ch_canonical_charset() leaves intact. We expect 'cs2' to originate from neomutt code, not user input (i.e. 'cs2' does not have any extension), so we simply check if the shorter string is a prefix of the longer.
Definition at line 442 of file charset.c.
const char * mutt_ch_get_default_charset | ( | const struct Slist *const | assumed_charset | ) |
Get the default character set.
assumed_charset | From $assumed_charset |
ptr | Name of the default character set |
Definition at line 465 of file charset.c.
char * mutt_ch_get_langinfo_charset | ( | void | ) |
Get the user's choice of character set.
ptr | Charset string |
Get the canonical character set used by the user's locale. The caller must free the returned string.
Definition at line 486 of file charset.c.
bool mutt_ch_lookup_add | ( | enum LookupType | type, |
const char * | pat, | ||
const char * | replace, | ||
struct Buffer * | err | ||
) |
Add a new character set lookup.
type | Type of character set, e.g. MUTT_LOOKUP_CHARSET |
pat | Pattern to match |
replace | Replacement string |
err | Buffer for error message |
Add a regex for a character set and a replacement name.
Definition at line 509 of file charset.c.
void mutt_ch_lookup_remove | ( | void | ) |
Remove all the character set lookups.
Empty the list of replacement character set names.
Definition at line 541 of file charset.c.
const char * mutt_ch_charset_lookup | ( | const char * | chs | ) |
Look for a replacement character set.
chs | Character set to lookup |
ptr | Replacement character set (if a 'charset-hook' matches) |
NULL | No matching hook |
Look through all the 'charset-hook's. If one matches return the replacement character set.
Definition at line 562 of file charset.c.
iconv_t mutt_ch_iconv_open | ( | const char * | tocode, |
const char * | fromcode, | ||
uint8_t | flags | ||
) |
Set up iconv for conversions.
tocode | Target character set |
fromcode | Current character set |
flags | Flags, e.g. MUTT_ICONV_HOOK_FROM |
ptr | iconv handle for the conversion |
Like iconv_open, but canonicalises the charsets, applies charset-hooks, recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers should use flags=0 when fromcode can safely be considered true, either some constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be used only when fromcode is unsure, taken from a possibly wrong incoming MIME label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions in some setups.
Since calling iconv_open() repeatedly can be expensive, we keep a cache of the most recently used iconv_t objects, kept in LRU order. This means that you should not call iconv_close() on the object yourself. All remaining objects in the cache will be closed when main() calls mutt_ch_cache_cleanup().
Definition at line 594 of file charset.c.
size_t mutt_ch_iconv | ( | iconv_t | cd, |
const char ** | inbuf, | ||
size_t * | inbytesleft, | ||
char ** | outbuf, | ||
size_t * | outbytesleft, | ||
const char ** | inrepls, | ||
const char * | outrepl, | ||
int * | iconverrno | ||
) |
Change the encoding of a string.
[in] | cd | Iconv conversion descriptor |
[in,out] | inbuf | Buffer to convert |
[in,out] | inbytesleft | Length of buffer to convert |
[in,out] | outbuf | Buffer for the result |
[in,out] | outbytesleft | Length of result buffer |
[in] | inrepls | Input replacement characters |
[in] | outrepl | Output replacement characters |
[out] | iconverrno | Errno if iconv() fails, 0 if it succeeds |
num | Characters converted |
Like iconv, but keeps going even when the input is invalid. If you're supplying inrepls, the source charset should be stateless; if you're supplying an outrepl, the target charset should be.
Definition at line 697 of file charset.c.
const char * mutt_ch_iconv_lookup | ( | const char * | chs | ) |
Look for a replacement character set.
chs | Character set to lookup |
ptr | Replacement character set (if an 'iconv-hook' matches) |
NULL | No matching hook |
Look through all the 'iconv-hook's. If one matches return the replacement character set.
Definition at line 781 of file charset.c.
int mutt_ch_check | ( | const char * | s, |
size_t | slen, | ||
const char * | from, | ||
const char * | to | ||
) |
Check whether a string can be converted between encodings.
[in] | s | String to check |
[in] | slen | Length of the string to check |
[in] | from | Current character set |
[in] | to | Target character set |
0 | Success |
-1 | Error in iconv_open() |
>0 | Errno as set by iconv() |
Definition at line 796 of file charset.c.
int mutt_ch_convert_string | ( | char ** | ps, |
const char * | from, | ||
const char * | to, | ||
uint8_t | flags | ||
) |
Convert a string between encodings.
[in,out] | ps | String to convert |
[in] | from | Current character set |
[in] | to | Target character set |
[in] | flags | Flags, e.g. MUTT_ICONV_HOOK_FROM |
0 | Success |
-1 | Invalid arguments or failure to open an iconv channel |
errno | Failure in iconv conversion |
Parameter flags is given as-is to mutt_ch_iconv_open(). See there for its meaning and usage policy.
Definition at line 831 of file charset.c.
bool mutt_ch_check_charset | ( | const char * | cs, |
bool | strict | ||
) |
Does iconv understand a character set?
cs | Character set to check |
strict | Check strictly by using iconv |
true | Character set is valid |
If strict
is false, then finding a matching character set in PreferredMimeNames will be enough. If strict
is true, or the charset is not in PreferredMimeNames, then iconv() will be run.
Definition at line 894 of file charset.c.
struct FgetConv * mutt_ch_fgetconv_open | ( | FILE * | fp, |
const char * | from, | ||
const char * | to, | ||
uint8_t | flags | ||
) |
Prepare a file for charset conversion.
fp | FILE ptr to prepare |
from | Current character set |
to | Destination character set |
flags | Flags, e.g. MUTT_ICONV_HOOK_FROM |
ptr | fgetconv handle |
Parameter flags is given as-is to mutt_ch_iconv_open().
Definition at line 933 of file charset.c.
void mutt_ch_fgetconv_close | ( | struct FgetConv ** | ptr | ) |
int mutt_ch_fgetconv | ( | struct FgetConv * | fc | ) |
Convert a file's character set.
fc | FgetConv handle |
num | Next character in the converted file |
EOF | Error |
A file is read into a buffer and its character set is converted. Each call to this function will return one converted character. The buffer is refilled automatically when empty.
Definition at line 983 of file charset.c.
char * mutt_ch_fgetconvs | ( | char * | buf, |
size_t | buflen, | ||
struct FgetConv * | fc | ||
) |
Convert a file's charset into a string buffer.
ptr | Success, result buffer |
NULL | Error |
Read a file into a buffer, converting the character set as it goes.
Definition at line 1045 of file charset.c.
void mutt_ch_set_charset | ( | const char * | charset | ) |
Update the records for a new character set.
charset | New character set |
Check if this character set is utf-8 and pick a suitable replacement character for unprintable characters.
Note: this calls bind_textdomain_codeset(), which will affect future message translations.
Definition at line 1078 of file charset.c.
char * mutt_ch_choose | ( | const char * | fromcode, |
const struct Slist * | charsets, | ||
const char * | u, | ||
size_t | ulen, | ||
char ** | d, | ||
size_t * | dlen | ||
) |
Figure the best charset to encode a string.
[in] | fromcode | Original charset of the string |
[in] | charsets | List of potential charsets to use |
[in] | u | String to encode |
[in] | ulen | Length of the string to encode |
[out] | d | If not NULL, point it to the converted string |
[out] | dlen | If not NULL, point it to the length of the d string |
ptr | Best performing charset |
NULL | None could be found |
Definition at line 1111 of file charset.c.
void mutt_ch_cache_cleanup | ( | void | ) |
Clean up the cached iconv handles and charset strings.
Definition at line 1178 of file charset.c.
wchar_t ReplacementChar = '?' |
bool CharsetIsUtf8 = false |
|
static |
|
static |
|
static |
|
static |
Lookup table of preferred charsets.
The following list has been created manually from the data under: http://www.isi.edu/in-notes/iana/assignments/character-sets Last update: 2000-09-07