use chardet for encoding detection
This commit is contained in:
2
setup.py
2
setup.py
@@ -51,6 +51,6 @@ setup(
|
|||||||
packages=find_packages(exclude=['tests', 'tests.*']),
|
packages=find_packages(exclude=['tests', 'tests.*']),
|
||||||
entry_points={'console_scripts': ['yamllint=yamllint.cli:run']},
|
entry_points={'console_scripts': ['yamllint=yamllint.cli:run']},
|
||||||
package_data={'yamllint': ['conf/*.yaml']},
|
package_data={'yamllint': ['conf/*.yaml']},
|
||||||
install_requires=['pathspec >=0.5.3', 'pyyaml'],
|
install_requires=['pathspec >=0.5.3', 'pyyaml', 'chardet'],
|
||||||
test_suite='tests',
|
test_suite='tests',
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -92,9 +92,9 @@ class CommandLineTestCase(unittest.TestCase):
|
|||||||
'dos.yml': '---\r\n'
|
'dos.yml': '---\r\n'
|
||||||
'dos: true',
|
'dos: true',
|
||||||
# UTF-16 Little Endian BOM
|
# UTF-16 Little Endian BOM
|
||||||
'non-ascii/utf16le': b'\xff\xfe---\nutf16le: true\n',
|
'non-ascii/utf16le': b'\xff\xfe' + u'---\nutf16le: true\n'.encode('utf-16-le'),
|
||||||
# UTF-16 Big Endian
|
# UTF-16 Big Endian
|
||||||
'non-ascii/utf16be': b'\xfe\xff---\nutf16be: true\n',
|
'non-ascii/utf16be': b'\xfe\xff' + u'---\nutf16be: true\n'.encode('utf-16-be'),
|
||||||
# UTF-8 BOM
|
# UTF-8 BOM
|
||||||
'non-ascii/utf8': b'\xef\xbb\xbf---\nutf8: true\n',
|
'non-ascii/utf8': b'\xef\xbb\xbf---\nutf8: true\n',
|
||||||
})
|
})
|
||||||
@@ -532,15 +532,18 @@ class CommandLineTestCase(unittest.TestCase):
|
|||||||
|
|
||||||
def test_encoding_detection_utf16le(self):
|
def test_encoding_detection_utf16le(self):
|
||||||
path = os.path.join(self.wd, 'non-ascii/utf16le')
|
path = os.path.join(self.wd, 'non-ascii/utf16le')
|
||||||
encoding = cli.determine_encoding(path)
|
with RunContext(self) as ctx:
|
||||||
self.assertEqual(encoding, 'utf-16-le')
|
cli.run(('-f', 'parsable', path))
|
||||||
|
self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', ''))
|
||||||
|
|
||||||
def test_encoding_detection_utf16be(self):
|
def test_encoding_detection_utf16be(self):
|
||||||
path = os.path.join(self.wd, 'non-ascii/utf16be')
|
path = os.path.join(self.wd, 'non-ascii/utf16be')
|
||||||
encoding = cli.determine_encoding(path)
|
with RunContext(self) as ctx:
|
||||||
self.assertEqual(encoding, 'utf-16-be')
|
cli.run(('-f', 'parsable', path))
|
||||||
|
self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', ''))
|
||||||
|
|
||||||
def test_encoding_detection_utf8(self):
|
def test_encoding_detection_utf8(self):
|
||||||
path = os.path.join(self.wd, 'non-ascii/utf8')
|
path = os.path.join(self.wd, 'non-ascii/utf8')
|
||||||
encoding = cli.determine_encoding(path)
|
with RunContext(self) as ctx:
|
||||||
self.assertEqual(encoding, 'utf-8')
|
cli.run(('-f', 'parsable', path))
|
||||||
|
self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', ''))
|
||||||
|
|||||||
@@ -17,7 +17,8 @@
|
|||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import codecs
|
import chardet
|
||||||
|
import contextlib
|
||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
@@ -29,16 +30,16 @@ from yamllint.config import YamlLintConfig, YamlLintConfigError
|
|||||||
from yamllint.linter import PROBLEM_LEVELS
|
from yamllint.linter import PROBLEM_LEVELS
|
||||||
|
|
||||||
|
|
||||||
def determine_encoding(file):
|
@contextlib.contextmanager
|
||||||
with io.open(file, 'rb') as raw_file:
|
def yamlopen(fp, **iowrapper_kwargs):
|
||||||
data = raw_file.read(4)
|
encoding = iowrapper_kwargs.pop('encoding', None)
|
||||||
if data.startswith(codecs.BOM_UTF16_LE):
|
with io.open(fp, mode='rb') as raw_file:
|
||||||
encoding = 'utf-16-le'
|
if encoding is None:
|
||||||
elif data.startswith(codecs.BOM_UTF16_BE):
|
raw_data = raw_file.read()
|
||||||
encoding = 'utf-16-be'
|
encoding = chardet.detect(raw_data).get('encoding') or 'utf-8'
|
||||||
else:
|
raw_file.seek(0)
|
||||||
encoding = 'utf-8'
|
with io.TextIOWrapper(raw_file, encoding=encoding, **iowrapper_kwargs) as decoded:
|
||||||
return encoding
|
yield decoded
|
||||||
|
|
||||||
|
|
||||||
def find_files_recursively(items, conf):
|
def find_files_recursively(items, conf):
|
||||||
@@ -190,8 +191,7 @@ def run(argv=None):
|
|||||||
for file in find_files_recursively(args.files, conf):
|
for file in find_files_recursively(args.files, conf):
|
||||||
filepath = file[2:] if file.startswith('./') else file
|
filepath = file[2:] if file.startswith('./') else file
|
||||||
try:
|
try:
|
||||||
encoding = determine_encoding(file)
|
with yamlopen(file, newline='') as f:
|
||||||
with io.open(file, newline='', encoding=encoding) as f:
|
|
||||||
problems = linter.run(f, conf, filepath)
|
problems = linter.run(f, conf, filepath)
|
||||||
except EnvironmentError as e:
|
except EnvironmentError as e:
|
||||||
print(e, file=sys.stderr)
|
print(e, file=sys.stderr)
|
||||||
|
|||||||
Reference in New Issue
Block a user