# Generates urlencoded.txt from utf-8.txt
#
# urlencoded.txt is used by Tests_Formatting_Utf8UriEncode
import urllib, codecs, re
import sys
# Matches a single percent-encoded octet: "%" followed by exactly two
# uppercase hexadecimal digits, captured as group 1.  Uppercase only,
# because that is the form urllib's quote emits; restricting the class to
# real hex digits keeps int(octet, 16) in fix() from ever seeing junk.
capfix = re.compile(r"%([0-9A-F]{2})")
def fix(match):
    """Rewrite one percent-encoded octet found by the ``capfix`` pattern.

    Args:
        match: Regex match whose group 1 holds the two hex digits of the
            octet.

    Returns:
        The decoded character itself when the octet is ASCII (below 128),
        otherwise the original escape with its hex digits lowercased.
    """
    octet = match.group(1)
    value = int(octet, 16)
    if value < 128:
        # ASCII is left unescaped; return the character exactly as decoded
        # so that its case is never altered.
        return chr(value)
    return "%" + octet.lower()
def urlencode(line):
    """Percent-encode each byte of non-ASCII unicode characters.

    Surrounding whitespace is stripped from ``line``, the remainder is
    UTF-8 encoded and passed through urllib's ``quote``; every resulting
    escape is then rewritten by ``fix`` via the ``capfix`` pattern.

    Args:
        line: One line of unicode text.

    Returns:
        The encoded line.
    """
    # quote lives in urllib.parse on Python 3 and in urllib on Python 2;
    # resolve it locally so the function works under either interpreter.
    try:
        from urllib.parse import quote
    except ImportError:
        from urllib import quote

    quoted = quote(line.strip().encode("utf-8"))
    return capfix.sub(fix, quoted)
if __name__ == "__main__":
    # Filter mode: read UTF-8 text on stdin, write one encoded line per
    # input line to stdout as ASCII, joined by "\n" with no trailing newline.
    args = sys.argv[1:]
    if args and args[0] in ("-h", "--help"):
        # Parenthesized single-argument form prints the same text on
        # Python 2 and is valid syntax on Python 3.
        print("Usage: python urlencode.py < utf-8.txt > urlencoded.txt")
        sys.exit(2)

    # The codecs wrappers need byte streams.  Python 3 exposes those as
    # .buffer on the text-mode standard streams; on Python 2 the standard
    # streams are already byte-oriented, so fall back to them directly.
    raw_in = getattr(sys.stdin, "buffer", sys.stdin)
    raw_out = getattr(sys.stdout, "buffer", sys.stdout)
    reader = codecs.getreader("utf-8")(raw_in)
    writer = codecs.getwriter("ascii")(raw_out)

    writer.write("\n".join(urlencode(line) for line in reader.readlines()))
|