Message129259
I wonder what this normalize_encoding() does! Here is a pretty standard version of mine which is a bit more expensive but catches many more cases! This is stripped, of course, and can be rewritten very easily to Python's needs (i.e. using char[32] instead of char[11]).
* @@li If a character is either ::s_char_is_space() or ::s_char_is_punct():
* @@li Replace with ASCII space (0x20).
* @@li Squeeze adjacent spaces to a single one.
* @@li Else if a character is ::s_char_is_alnum():
* @@li ::s_char_to_lower() characters.
* @@li Separate groups of alphas and digits with ASCII space (0x20).
* @@li Else discard character.
* E.g. "ISO_8859---1" becomes "iso 8859 1"
* and "ISO8859-1" also becomes "iso 8859 1".
s_textcodec_normalize_name(s_CString *_name) {
enum { C_NONE, C_WS, C_ALPHA, C_DIGIT } c_type = C_NONE;
char *name, c;
auto s_CString input;
s_cstring_swap(s_cstring_init(&input), _name);
_name = s_cstring_reserve(_name, 31, s_FAL0);
name = s_cstring_cstr(&input);
while ((c = *(name++)) != s_NUL) {
s_si8 sep = s_FAL0;
if (s_char_is_space(c) || s_char_is_punct(c)) {
if (c_type == C_WS)
continue;
c_type = C_WS;
c = ' ';
} else if (s_char_is_alpha(c)) {
sep = (c_type == C_DIGIT);
c_type = C_ALPHA;
c = s_char_to_lower(c);
} else if (s_char_is_digit(c)) {
sep = (c_type == C_ALPHA);
c_type = C_DIGIT;
} else
continue;
do
_name = s_cstring_append_char(_name, (sep ? ' ' : c));
while (--sep >= s_FAL0);
}
s_cstring_destroy(&input);
return _name;
} |
|
Date |
User |
Action |
Args |
2011-02-24 11:38:15 | sdaoden | set | recipients:
+ sdaoden, lemburg, jcea, belopolsky, ezio.melotti, eric.araujo |
2011-02-24 11:38:15 | sdaoden | set | messageid: <1298547495.11.0.706319063848.issue11303@psf.upfronthosting.co.za> |
2011-02-24 11:38:12 | sdaoden | link | issue11303 messages |
2011-02-24 11:38:12 | sdaoden | create | |
|