Note: We no longer publish the latest version of our code here. We primarily use a kumc-bmi github organization. The heron ETL repository, in particular, is not public. Peers in the informatics community should see MultiSiteDev for details on requesting access.

source: heron_load/verify_ascii_encoding.py @ 0:42ad7288920a

heron-michigan tip
Last change on this file since 0:42ad7288920a was 0:42ad7288920a, checked in by Matt Hoag <mhoag@…>, 6 years ago

Merge with demo_concepts_3800

File size: 3.6 KB
Line 
''' Verify .py and .sql files in this script's directory are ASCII encoded
'''
import os
import re
import sys
5
6
def find_line_col(s_bytes, char_offset):
    '''Find line/column of offset (1-indexed to align with most text editors)

    Works on byte strings as well as text strings; only the newline
    character is treated as a line separator.

      - s_bytes: the complete content the offset refers to
      - char_offset: 0-based offset into s_bytes (newlines are counted)
      - Returns a (line, col) tuple, both 1-indexed
      - Raises IndexError if the specified offset is negative or exceeds
        s_bytes-1 length

    >>> find_line_col('abc\\ndef', 5) # the e
    (2, 2)
    >>> find_line_col('\\nabc\\ndef\\n\\njhi\\n\\nasdf', 10) # the j
    (5, 1)
    >>> find_line_col(b'abc\\ndef', 5)
    (2, 2)
    >>> find_line_col('', 4)
    Traceback (most recent call last):
        ...
    IndexError: Offset 4 exceeds length of string (0)
    >>> find_line_col('abc', -1)
    Traceback (most recent call last):
        ...
    IndexError: Offset -1 is negative
    '''
    # A negative offset would otherwise be treated as "from the end" by the
    # slice below and yield a meaningless position.
    if char_offset < 0:
        raise IndexError('Offset %d is negative' % char_offset)
    # offset includes newline
    if char_offset >= len(s_bytes):
        raise IndexError('Offset %d exceeds length of string (%s)' %
                         (char_offset, len(s_bytes)))

    # The delimiter has to be of the same type as the buffer being searched.
    if isinstance(s_bytes, (bytes, bytearray)):
        newline = b'\n'
    else:
        newline = u'\n'

    head = s_bytes[:char_offset]
    line = head.count(newline) + 1
    # rfind() gives -1 when there is no earlier newline, which makes the
    # first character of the buffer column 1.
    col = char_offset - head.rfind(newline)
    return (line, col)
27
28
def check_output_nonascii_file(path, opener, output):
    '''Check file and report using output() function if non-ascii found

      - path: name of the file; passed to opener() and used in messages
      - opener: callable(path, mode) returning an object with read() and
        close(); it is called with mode 'rb' so the raw bytes are examined
      - output: callable taking one message string per offending character
      - Return 0 if no non-ascii characters found (and report no output)
      - Return 1 (and report using output()) if non-ascii characters found

    Content read as bytes is checked byte by byte; if opener() hands back
    text instead, every character above 0x7f is reported.

    >>> import io
    >>> def opener_for(data):
    ...     return lambda path, mode: io.BytesIO(data)
    >>> def output(s): print(s)

    None found:
    >>> check_output_nonascii_file('somefile.sql',
    ...                            opener_for(b'abc\\ndef'), output)
    0

    Non-ascii character in file:
    >>> check_output_nonascii_file('somefile.sql',
    ...                            opener_for(b'abc\\ndef\\n\\x92\\nghi'),
    ...                            output)
    Non-ASCII character (0x92) in somefile.sql: line 3, col 1 (1-indexed)
    1

    More than one non-ascii character in file:
    >>> check_output_nonascii_file(
    ...     'somefile.sql',
    ...     opener_for(b'abc\\ndef\\n\\x92\\nghi\\n\\nabc\\ndef\\nq\\x92\\nghi'),
    ...     output)
    Non-ASCII character (0x92) in somefile.sql: line 3, col 1 (1-indexed)
    Non-ASCII character (0x92) in somefile.sql: line 8, col 2 (1-indexed)
    1

    More than one non-ascii character in file (side-by-side)
    >>> check_output_nonascii_file('somefile.sql',
    ...                            opener_for(b'\\x92\\x92\\n\\x92'), output)
    Non-ASCII character (0x92) in somefile.sql: line 1, col 1 (1-indexed)
    Non-ASCII character (0x92) in somefile.sql: line 1, col 2 (1-indexed)
    Non-ASCII character (0x92) in somefile.sql: line 2, col 1 (1-indexed)
    1
    '''
    f = opener(path, 'rb')
    try:
        s_bytes = f.read()
    finally:
        # Release the handle even when read() fails.
        f.close()

    # Pattern and delimiter must have the same type as the data searched.
    if isinstance(s_bytes, bytes):
        non_ascii, newline = b'[^\\x00-\\x7f]', b'\n'
    else:
        non_ascii, newline = u'[^\\x00-\\x7f]', u'\n'

    ret = 0
    line = 1
    counted_to = 0  # newlines before this offset are already in `line`
    for match in re.finditer(non_ascii, s_bytes):
        pos = match.start()
        # Only count the newlines since the previous hit, so the whole
        # buffer is walked once no matter how many hits there are.
        line += s_bytes.count(newline, counted_to, pos)
        counted_to = pos
        # rfind() is -1 when the hit is on the first line -> 1-indexed col.
        col = pos - s_bytes.rfind(newline, 0, pos)
        output('Non-ASCII character (0x%x) in %s: line %d, '
               'col %d (1-indexed)' % (ord(match.group()),
                                       path, line, col))
        ret = 1
    return ret
91
92
if __name__ == '__main__':
    # Script entry point: check every .py/.sql file that sits next to this
    # script and exit with status 1 if any of them has a non-ASCII character.

    def output(s):
        '''Write one report line to stdout.'''
        # The parenthesized form prints the same text under the Python 2
        # print statement and is also valid Python 3 syntax.
        print(s)

    def choose_files(argv=None):
        '''Return a tuple of full paths of the files to check.

        argv is accepted but currently unused.
        '''
        # Ignore args for now - just take all .sql and .py files
        basedir = os.path.dirname(os.path.realpath(__file__))
        # os.listdir() order is arbitrary; sort so reports are reproducible.
        return tuple(os.path.join(basedir, f)
                     for f in sorted(os.listdir(basedir))
                     if os.path.splitext(f)[1].lower() in ('.py', '.sql'))

    ret = 0
    for f in choose_files(sys.argv[:]):
        # Keep 1 once any file has reported a problem.
        ret = max(ret, check_output_nonascii_file(f, open, output))

    sys.exit(ret)
Note: See TracBrowser for help on using the repository browser.