Message 129259 - Python tracker

➜

This issue tracker has been migrated to GitHub, and is currently read-only.
For more information, see the GitHub FAQs in the Python's Developer Guide.

Author	sdaoden
Recipients	belopolsky, eric.araujo, ezio.melotti, jcea, lemburg, sdaoden
Date	2011-02-24.11:38:12
SpamBayes Score	2.440535e-10
Marked as misclassified	No
Message-id	<1298547495.11.0.706319063848.issue11303@psf.upfronthosting.co.za>
In-reply-to

Content
I wonder what this normalize_encoding() does! Here is a pretty standard version of mine which is a bit more expensive but catches match more cases! This is stripped, of course, and can be rewritten very easily to Python's needs (i.e. using char[32] instead of char[11]. * @@li If a character is either ::s_char_is_space() or ::s_char_is_punct(): * @@li Replace with ASCII space (0x20). * @@li Squeeze adjacent spaces to a single one. * @@li Else if a character is ::s_char_is_alnum(): * @@li ::s_char_to_lower() characters. * @@li Separate groups of alphas and digits with ASCII space (0x20). * @@li Else discard character. * E.g. "ISO_8859---1" becomes "iso 8859 1" * and "ISO8859-1" also becomes "iso 8859 1". s_textcodec_normalize_name(s_CString _name) { enum { C_NONE, C_WS, C_ALPHA, C_DIGIT } c_type = C_NONE; char name, c; auto s_CString input; s_cstring_swap(s_cstring_init(&input), _name); _name = s_cstring_reserve(_name, 31, s_FAL0); name = s_cstring_cstr(&input); while ((c = *(name++)) != s_NUL) { s_si8 sep = s_FAL0; if (s_char_is_space(c) \|\| s_char_is_punct(c)) { if (c_type == C_WS) continue; c_type = C_WS; c = ' '; } else if (s_char_is_alpha(c)) { sep = (c_type == C_DIGIT); c_type = C_ALPHA; c = s_char_to_lower(c); } else if (s_char_is_digit(c)) { sep = (c_type == C_ALPHA); c_type = C_DIGIT; } else continue; do _name = s_cstring_append_char(_name, (sep ? ' ' : c)); while (--sep >= s_FAL0); } s_cstring_destroy(&input); return _name; }

I wonder what this normalize_encoding() does!  Here is a pretty standard version of mine which is a bit more expensive but catches many more cases!  This is stripped, of course, and can be rewritten very easily to Python's needs (i.e. using char[32] instead of char[11]).

 * @@li If a character is either ::s_char_is_space() or ::s_char_is_punct():
 *      @@li    Replace with ASCII space (0x20).
 *      @@li    Squeeze adjacent spaces to a single one.
 * @@li Else if a character is ::s_char_is_alnum():
 *      @@li    ::s_char_to_lower() characters.
 *      @@li    Separate groups of alphas and digits with ASCII space (0x20).
 * @@li Else discard character.
 * E.g. "ISO_8859---1" becomes "iso 8859 1"
 * and "ISO8859-1" also becomes "iso 8859 1".

/**
 * Normalize a text codec (character set) name in place.
 *
 * The original content of `_name` is scanned byte by byte and `_name` is
 * rebuilt as follows:
 *   - every run of space and/or punctuation characters becomes exactly one
 *     ASCII space (0x20);
 *   - alphabetic characters are lower-cased, digits are copied unchanged;
 *   - an ASCII space is inserted wherever an alphabetic character directly
 *     follows a digit or a digit directly follows an alphabetic character;
 *   - any other character is discarded and does not affect run/group state.
 *
 * E.g. "ISO_8859---1" and "ISO8859-1" both become "iso 8859 1".
 *
 * Note: the result is not trimmed.  A leading or trailing separator run still
 * yields one leading or trailing space (" utf-8 " becomes " utf 8 ").
 *
 * @param _name  String to normalize; its content is replaced by the result.
 * @return       The string object holding the normalized name, i.e. the
 *               value handed back by the last s_cstring_reserve() /
 *               s_cstring_append_char() call.
 */
s_CString *
s_textcodec_normalize_name(s_CString *_name) {
        enum { C_NONE, C_WS, C_ALPHA, C_DIGIT } c_type = C_NONE;
        char *name, c;
        auto s_CString input;

        /*
         * Swap the caller's content into a local string so `_name` can be
         * refilled while the old bytes are read.
         * NOTE(review): assumes s_cstring_swap() exchanges contents and that
         * s_cstring_reserve(.., 31, ..) pre-sizes for 31 chars -- confirm.
         */
        s_cstring_swap(s_cstring_init(&input), _name);
        _name = s_cstring_reserve(_name, 31, s_FAL0);
        name = s_cstring_cstr(&input);

        while ((c = *(name++)) != s_NUL) {
                /* Set when an alpha<->digit boundary needs a separating space. */
                s_si8 sep = s_FAL0;

                if (s_char_is_space(c) || s_char_is_punct(c)) {
                        /* Squeeze: only the first separator of a run is emitted. */
                        if (c_type == C_WS)
                                continue;
                        c_type = C_WS;
                        c = ' ';
                } else if (s_char_is_alpha(c)) {
                        sep = (c_type == C_DIGIT);
                        c_type = C_ALPHA;
                        c = s_char_to_lower(c);
                } else if (s_char_is_digit(c)) {
                        sep = (c_type == C_ALPHA);
                        c_type = C_DIGIT;
                } else {
                        /* Neither separator nor alphanumeric: drop it. */
                        continue;
                }

                /* Emit the optional group separator, then the character itself. */
                if (sep)
                        _name = s_cstring_append_char(_name, ' ');
                _name = s_cstring_append_char(_name, c);
        }

        s_cstring_destroy(&input);
        return _name;
}

History
Date	User	Action	Args
2011-02-24 11:38:15	sdaoden	set	recipients: + sdaoden, lemburg, jcea, belopolsky, ezio.melotti, eric.araujo
2011-02-24 11:38:15	sdaoden	set	messageid: <1298547495.11.0.706319063848.issue11303@psf.upfronthosting.co.za>
2011-02-24 11:38:12	sdaoden	link	issue11303 messages
2011-02-24 11:38:12	sdaoden	create