Patch summary (free text ahead of the first "diff --git" header; it is not
part of any hunk).

  * setup.py: add 'chardet' to install_requires.
  * tests/test_cli.py: build the UTF-16 fixtures as a BOM followed by text
    that is really encoded as UTF-16 (previously a BOM followed by ASCII
    bytes), and turn the three encoding-detection tests into end-to-end
    cli.run() checks expecting exit code 0 with empty stdout and stderr.
  * yamllint/cli.py: replace determine_encoding() (UTF-16 BOM sniffing on the
    first 4 bytes, UTF-8 otherwise) with the yamlopen() context manager.
    Unless the caller passes encoding=..., yamlopen() reads the whole file,
    asks chardet.detect() for the encoding, falls back to 'utf-8' when none
    is reported, rewinds, and yields an io.TextIOWrapper over the raw file.

NOTE(review): this copy was rebuilt from a whitespace-collapsed paste. Hunk
contents and @@ line counts are unchanged, but the indentation of context
and removed lines (including whether the BOM checks in determine_encoding()
sat inside its "with") is inferred - confirm against the target files.
NOTE(review): run() only catches EnvironmentError around yamlopen(); a
LookupError (codec name unknown to Python) or a UnicodeDecodeError (wrong
guess) would presumably propagate as a traceback - confirm and handle.
NOTE(review): 'import chardet' is third-party but is added inside the
standard-library import group.

diff --git a/setup.py b/setup.py
index ffa2ee2..df6cfeb 100644
--- a/setup.py
+++ b/setup.py
@@ -51,6 +51,6 @@ setup(
     packages=find_packages(exclude=['tests', 'tests.*']),
     entry_points={'console_scripts': ['yamllint=yamllint.cli:run']},
     package_data={'yamllint': ['conf/*.yaml']},
-    install_requires=['pathspec >=0.5.3', 'pyyaml'],
+    install_requires=['pathspec >=0.5.3', 'pyyaml', 'chardet'],
     test_suite='tests',
 )
diff --git a/tests/test_cli.py b/tests/test_cli.py
index a671cdf..76fef52 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -92,9 +92,9 @@ class CommandLineTestCase(unittest.TestCase):
             'dos.yml': '---\r\n'
                        'dos: true',
             # UTF-16 Little Endian BOM
-            'non-ascii/utf16le': b'\xff\xfe---\nutf16le: true\n',
+            'non-ascii/utf16le': b'\xff\xfe' + u'---\nutf16le: true\n'.encode('utf-16-le'),
             # UTF-16 Big Endian
-            'non-ascii/utf16be': b'\xfe\xff---\nutf16be: true\n',
+            'non-ascii/utf16be': b'\xfe\xff' + u'---\nutf16be: true\n'.encode('utf-16-be'),
             # UTF-8 BOM
             'non-ascii/utf8': b'\xef\xbb\xbf---\nutf8: true\n',
         })
@@ -532,15 +532,18 @@ class CommandLineTestCase(unittest.TestCase):
 
     def test_encoding_detection_utf16le(self):
         path = os.path.join(self.wd, 'non-ascii/utf16le')
-        encoding = cli.determine_encoding(path)
-        self.assertEqual(encoding, 'utf-16-le')
+        with RunContext(self) as ctx:
+            cli.run(('-f', 'parsable', path))
+        self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', ''))
 
     def test_encoding_detection_utf16be(self):
         path = os.path.join(self.wd, 'non-ascii/utf16be')
-        encoding = cli.determine_encoding(path)
-        self.assertEqual(encoding, 'utf-16-be')
+        with RunContext(self) as ctx:
+            cli.run(('-f', 'parsable', path))
+        self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', ''))
 
     def test_encoding_detection_utf8(self):
         path = os.path.join(self.wd, 'non-ascii/utf8')
-        encoding = cli.determine_encoding(path)
-        self.assertEqual(encoding, 'utf-8')
+        with RunContext(self) as ctx:
+            cli.run(('-f', 'parsable', path))
+        self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', ''))
diff --git a/yamllint/cli.py b/yamllint/cli.py
index d84c6f0..385c27f 100644
--- a/yamllint/cli.py
+++ b/yamllint/cli.py
@@ -17,7 +17,8 @@
 from __future__ import print_function
 
 import argparse
-import codecs
+import chardet
+import contextlib
 import io
 import os
 import platform
@@ -29,16 +30,16 @@ from yamllint.config import YamlLintConfig, YamlLintConfigError
 from yamllint.linter import PROBLEM_LEVELS
 
 
-def determine_encoding(file):
-    with io.open(file, 'rb') as raw_file:
-        data = raw_file.read(4)
-    if data.startswith(codecs.BOM_UTF16_LE):
-        encoding = 'utf-16-le'
-    elif data.startswith(codecs.BOM_UTF16_BE):
-        encoding = 'utf-16-be'
-    else:
-        encoding = 'utf-8'
-    return encoding
+@contextlib.contextmanager
+def yamlopen(fp, **iowrapper_kwargs):
+    encoding = iowrapper_kwargs.pop('encoding', None)
+    with io.open(fp, mode='rb') as raw_file:
+        if encoding is None:
+            raw_data = raw_file.read()
+            encoding = chardet.detect(raw_data).get('encoding') or 'utf-8'
+            raw_file.seek(0)
+        with io.TextIOWrapper(raw_file, encoding=encoding, **iowrapper_kwargs) as decoded:
+            yield decoded
 
 
 def find_files_recursively(items, conf):
@@ -190,8 +191,7 @@ def run(argv=None):
     for file in find_files_recursively(args.files, conf):
         filepath = file[2:] if file.startswith('./') else file
         try:
-            encoding = determine_encoding(file)
-            with io.open(file, newline='', encoding=encoding) as f:
+            with yamlopen(file, newline='') as f:
                 problems = linter.run(f, conf, filepath)
         except EnvironmentError as e:
             print(e, file=sys.stderr)