Split requirements.txt parsing to its own module; test & improve it

Related: rhbz#1950291
2021-07-23 08:00:01 +00:00 · 2021-07-23 08:00:01 +00:00 · a3a1caf32a
commit a3a1caf32a
parent f190b5b225
6 changed files with 268 additions and 54 deletions
--- a/pyproject-rpm-macros.spec
+++ b/pyproject-rpm-macros.spec
@ -23,10 +23,12 @@ Source102:      pyproject_save_files.py
 Source103:      pyproject_convert.py
 Source104:      pyproject_preprocess_record.py
 Source105:      pyproject_construct_toxenv.py
+Source106:      pyproject_requirements_txt.py

 # Tests
 Source201:      test_pyproject_buildrequires.py
 Source202:      test_pyproject_save_files.py
+Source203:      test_pyproject_requirements_txt.py

 # Test data
 Source301:      pyproject_buildrequires_testcases.yaml
@ -95,6 +97,7 @@ install -m 644 pyproject_convert.py %{buildroot}%{_rpmconfigdir}/redhat/
 install -m 644 pyproject_save_files.py  %{buildroot}%{_rpmconfigdir}/redhat/
 install -m 644 pyproject_preprocess_record.py %{buildroot}%{_rpmconfigdir}/redhat/
 install -m 644 pyproject_construct_toxenv.py %{buildroot}%{_rpmconfigdir}/redhat/
+install -m 644 pyproject_requirements_txt.py %{buildroot}%{_rpmconfigdir}/redhat/

 %if %{with tests}
 %check
@ -110,6 +113,7 @@ export HOSTNAME="rpmbuild"  # to speedup tox in network-less mock, see rhbz#1856
 %{_rpmconfigdir}/redhat/pyproject_save_files.py
 %{_rpmconfigdir}/redhat/pyproject_preprocess_record.py
 %{_rpmconfigdir}/redhat/pyproject_construct_toxenv.py
+%{_rpmconfigdir}/redhat/pyproject_requirements_txt.py

 %doc README.md
 %license LICENSE
--- a/pyproject_buildrequires.py
+++ b/pyproject_buildrequires.py
@ -11,16 +11,14 @@ import re
 import tempfile
 import email.parser
 import pathlib
-import urllib
+
+from pyproject_requirements_txt import convert_requirements_txt


 # Some valid Python version specifiers are not supported.
 # Allow only the forms we know we can handle.
 VERSION_RE = re.compile(r'[a-zA-Z0-9.-]+(\.\*)?')

-# We treat this as comment in requirements files, as does pip
-COMMENT_RE = re.compile(r'(^|\s+)#.*$')
-

 class EndPass(Exception):
    """End current pass of generating requirements"""
@ -54,21 +52,16 @@ def hook_call():
        print_err('HOOK STDOUT:', line)


-def pkgname_from_egg_fragment(requirement_str):
-    parsed_url = urllib.parse.urlparse(requirement_str)
-    parsed_fragment = urllib.parse.parse_qs(parsed_url.fragment)
-    if 'egg' in parsed_fragment:
-        return parsed_fragment['egg'][0]
-    return None
-
-
 def guess_reason_for_invalid_requirement(requirement_str):
    if ':' in requirement_str:
-        return (
+        message = (
            'It might be an URL. '
            '%pyproject_buildrequires cannot handle all URL-based requirements. '
            'Add PackageName@ (see PEP 508) to the URL to at least require any version of PackageName.'
        )
+        if '@' in requirement_str:
+            message += ' (but note that URLs might not work well with other features)'
+        return message
    if '/' in requirement_str:
        return (
            'It might be a local path. '
@ -110,22 +103,18 @@ class Requirements:
                return True
        return False

-    def add(self, requirement_str, *, source=None, allow_egg_pkgname=False):
+    def add(self, requirement_str, *, source=None):
        """Output a Python-style requirement string as RPM dep"""
        print_err(f'Handling {requirement_str} from {source}')

        try:
            requirement = Requirement(requirement_str)
        except InvalidRequirement:
-            if allow_egg_pkgname and (egg_name := pkgname_from_egg_fragment(requirement_str)):
-                requirement = Requirement(egg_name)
-                requirement.url = requirement_str
-            else:
-                hint = guess_reason_for_invalid_requirement(requirement_str)
-                message = f'Requirement {requirement_str!r} from {source} is invalid.'
-                if hint:
-                    message += f' Hint: {hint}'
-                raise ValueError(message)
+            hint = guess_reason_for_invalid_requirement(requirement_str)
+            message = f'Requirement {requirement_str!r} from {source} is invalid.'
+            if hint:
+                message += f' Hint: {hint}'
+            raise ValueError(message)

        if requirement.url:
            print_err(
@ -276,27 +265,6 @@ def generate_run_requirements(backend, requirements):
        requirements.extend(requires, source=f'wheel metadata: {key}')


-def parse_requirements_lines(lines, path=None):
-    packages = []
-    for line in lines:
-        line = COMMENT_RE.sub('', line)
-        line = line.strip()
-        if line.startswith('-r'):
-            recursed_path = line[2:].strip()
-            if path:
-                recursed_path = path.parent / recursed_path
-            with open(recursed_path) as f:
-                packages.extend(parse_requirements_lines(f.read().splitlines(), recursed_path))
-        elif line.startswith('-'):
-            print_err(
-                f'WARNING: Skipping dependency line: {line}\n'
-                + f'    tox deps options other than -r are not supported (yet).',
-            )
-        elif line:
-            packages.append(line)
-    return packages
-
-
 def generate_tox_requirements(toxenv, requirements):
    toxenv = ','.join(toxenv)
    requirements.add('tox-current-env >= 0.0.6', source='tox itself')
@ -335,7 +303,7 @@ def generate_tox_requirements(toxenv, requirements):
            r.check_returncode()

        deplines = deps.read().splitlines()
-        packages = parse_requirements_lines(deplines)
+        packages = convert_requirements_txt(deplines)
        requirements.add_extras(*extras.read().splitlines())
        requirements.extend(packages,
                            source=f'tox --print-deps-only: {toxenv}')
@ -372,11 +340,10 @@ def generate_requires(
            raise ValueError('-N option cannot be used in combination with -r, -e, -t, -x options')
        if requirement_files:
            for req_file in requirement_files:
-                lines = req_file.read().splitlines()
-                packages = parse_requirements_lines(lines, pathlib.Path(req_file.name))
-                requirements.extend(packages,
-                                    source=f'requirements file {req_file.name}',
-                                    allow_egg_pkgname=True)
+                requirements.extend(
+                    convert_requirements_txt(req_file, pathlib.Path(req_file.name)),
+                    source=f'requirments file {req_file.name}'
+                )
            requirements.check(source='all requirement files')
        if use_build_system:
            backend = get_backend(requirements)
--- a/pyproject_buildrequires_testcases.yaml
+++ b/pyproject_buildrequires_testcases.yaml
@ -603,7 +603,7 @@ With pyproject.toml, requirements file and with -N option:
    python3dist(paramiko)
    python3dist(sqlalchemy)
    python3dist(spam)
-  stderr_contains: "WARNING: Simplifying 'git+https://github.com/monty/spam.git@master#egg=spam' to 'spam'."
+  stderr_contains: "WARNING: Simplifying 'spam@git+https://github.com/monty/spam.git@master#egg=spam' to 'spam'."
  result: 0

 With pyproject.toml, requirements file and without -N option:
@ -662,3 +662,57 @@ Value error if -N and -e arguments are present:
    - py3
  use_build_system: false
  except: ValueError
+
+Weird and complex requirements file:
+  installed:
+    setuptools: 50
+    wheel: 1
+  setup.py: |
+    from setuptools import setup
+    setup(
+        name='test',
+        version='0.1',
+    )
+  requirements.txt: |
+    Normal_Req ~= 1.2.0
+
+    good@git+https://github.com/monty/spam.git@master#egg=bad
+    git+https://github.com/monty/spam.git@master#egg=ugly
+
+    this-name-is-too-\
+    long-for-this-file<\
+    =30  # even names and operators can be split
+
+    # this is not a multi-line comment \
+    some-dep
+    other-dep  # but this *is* a multi-line coment \
+    so any garbage can be here
+    dep-a # and this comment ends with the blank line below \
+
+    dep-b
+    -r requirements2.txt
+    ${PACKAGE}${WANTED_VERSION}
+  requirements2.txt: |
+    dep-from-included-file
+  requirement_files:
+    - requirements.txt
+  environ:
+    PACKAGE: package
+    WANTED_VERSION: -from-environ >= 1.2.3
+  expected: |
+    (python3dist(normal-req) >= 1.2 with python3dist(normal-req) < 1.3)
+    python3dist(good)
+    python3dist(ugly)
+    python3dist(this-name-is-too-long-for-this-file) <= 30
+    python3dist(some-dep)
+    python3dist(other-dep)
+    python3dist(dep-a)
+    python3dist(dep-b)
+    python3dist(dep-from-included-file)
+    python3dist(package-from-environ) >= 1.2.3
+  stderr_contains:
+  - "WARNING: Simplifying 'good@git+https://github.com/monty/spam.git@master#egg=bad' to 'good'."
+  # XXX: pyproject_requirements_txt adds a prefix that's not actually in the source;
+  # but that's good enough:
+  - "WARNING: Simplifying 'ugly@git+https://github.com/monty/spam.git@master#egg=ugly' to 'ugly'."
+  result: 0
--- a/pyproject_requirements_txt.py
+++ b/pyproject_requirements_txt.py
@ -0,0 +1,103 @@
+"""Best-effort parser for requirements.txt files"""
+
+import urllib.parse
+from pathlib import Path
+import sys
+import os
+import re
+
+# `#` starts a comment only at line start or after whitespace (runs to end of line)
+COMMENT_RE = re.compile(r'(^|\s+)#.*$')
+
+# Assume URLs start with a scheme; don't look for "egg=" URLs otherwise
+URL_START_RE = re.compile(r'^[-_+a-zA-Z0-9]+://')
+
+ENV_VAR_RE = re.compile(r'(?P<var>\$\{(?P<name>[A-Z0-9_]+)\})')
+PKGNAME_RE = re.compile(r'^[-_a-zA-Z0-9]+')
+
+# The requirements.txt format evolved rather organically; expect weirdness.
+
+def convert_requirements_txt(lines, path:Path = None):
+    """Convert lines of a requirements file to PEP 508-style requirement strs
+
+    This does NOT handle all of requirements.txt features (only pip can do
+    that), but tries its best.
+
+    The resulting requirements might not actually be valid (either because
+    they're wrong in the file, or because we missed a special case).
+
+    path is the path to the requirements.txt file, used for options like `-r`.
+    """
+    requirements = []
+    lines = combine_logical_lines(lines)
+    lines = strip_comments(lines)
+    lines = expand_env_vars(lines)
+    if path:
+        filename = path.name
+    else:
+        filename = '<requirements file>'
+    for line in lines:
+        if URL_START_RE.match(line):
+            # Handle URLs with "egg=..." fragments
+            # see https://pip.pypa.io/en/stable/cli/pip_install/#vcs-support
+            parsed_url = urllib.parse.urlparse(line)
+            parsed_fragment = urllib.parse.parse_qs(parsed_url.fragment)
+            if 'egg' in parsed_fragment:
+                # Prepend the package name to the URL.
+                match = PKGNAME_RE.match(parsed_fragment['egg'][0])
+                if match:
+                    pkg_name = match[0]
+                    requirements.append(f'{pkg_name}@{line}')
+                    continue
+            # If that didn't work, pass the line on;
+            # the caller will deal with invalid requirements
+            requirements.append(line)
+        elif line.startswith('-r'):
+            recursed_path = line[2:].strip()
+            if path:
+                recursed_path = path.parent / recursed_path
+            recursed_path = Path(recursed_path)
+            with recursed_path.open() as f:
+                requirements.extend(convert_requirements_txt(f, recursed_path))
+        elif line.startswith('-'):
+            raise ValueError(f'{filename}: unsupported requirements file option: {line}')
+        else:
+            requirements.append(line)
+    return requirements
+
+def combine_logical_lines(lines):
+    """Combine logical lines together (backslash line-continuation)"""
+    pieces = []
+    for line in lines:
+        line = line.rstrip('\n')
+        # Whole-line comments *only* are removed before line-continuation
+        if COMMENT_RE.match(line):
+            continue
+        if line.endswith('\\'):
+            pieces.append(line[:-1])
+        else:
+            # trailing whitespace is only removed from full logical lines
+            pieces.append(line.rstrip())
+            yield ''.join(pieces)
+            pieces = []
+    yield ''.join(pieces)
+
+
+def strip_comments(lines):
+    for line in lines:
+        line, *rest = COMMENT_RE.split(line, maxsplit=1)
+        line = line.strip()
+        if line:
+            yield line
+
+
+def expand_env_vars(lines):
+    def repl(match):
+        value = os.getenv(match['name'])
+        if value is None:
+            return match['var']
+        return value
+    for line in lines:
+        if match := ENV_VAR_RE.search(line):
+            var = match['var']
+        yield ENV_VAR_RE.sub(repl, line)
--- a/test_pyproject_buildrequires.py
+++ b/test_pyproject_buildrequires.py
@ -1,6 +1,5 @@
 from pathlib import Path
 import importlib.metadata
-import io

 import pytest
 import yaml
@ -29,6 +28,9 @@ def test_data(case_name, capsys, tmp_path, monkeypatch):
        if filename.endswith(file_types):
            cwd.joinpath(filename).write_text(case[filename])

+    for name, value in case.get('environ', {}).items():
+        monkeypatch.setenv(name, value)
+
    def get_installed_version(dist_name):
        try:
            return str(case['installed'][dist_name])
@ -65,8 +67,14 @@ def test_data(case_name, capsys, tmp_path, monkeypatch):

        if 'expected' in case:
            assert out == case['expected']
-        if 'stderr_contains' in case:
-            assert case['stderr_contains'] in err
+
+        # stderr_contains may be a string or list of strings
+        stderr_contains = case.get('stderr_contains')
+        if stderr_contains is not None:
+            if isinstance(stderr_contains, str):
+                stderr_contains = [stderr_contains]
+            for expected_substring in stderr_contains:
+                assert expected_substring in err
    finally:
        for req in requirement_files:
            req.close()
--- a/test_pyproject_requirements_txt.py
+++ b/test_pyproject_requirements_txt.py
@ -0,0 +1,78 @@
+from pathlib import Path
+from textwrap import dedent
+
+from pyproject_requirements_txt import convert_requirements_txt
+
+
+def test_requirements_add_pkgname():
+    reqs_txt = dedent(r"""
+        good@git+https://github.com/monty/spam.git@master#egg=bad
+        git+https://github.com/monty/spam.git@master#egg=ugly
+        https://example.com/undead.tar.gz#egg=undead ; python_version > 3.0
+    """)
+    result = convert_requirements_txt(reqs_txt.splitlines())
+
+    expected = [
+        'good@git+https://github.com/monty/spam.git@master#egg=bad',
+        'ugly@git+https://github.com/monty/spam.git@master#egg=ugly',
+        'undead@https://example.com/undead.tar.gz#egg=undead ; python_version > 3.0',
+    ]
+    assert result == expected
+
+
+def test_requirements_preprocess(monkeypatch):
+    reqs_txt = dedent(r"""
+        Normal_Req ~= 1.2.0
+           whitespace-stripped < 3    <END>
+
+        # indentation is preserved in continuations:
+        foo <=\
+            30
+        bar<=   \
+        30
+        # names and operators can be split:
+        this-was-\
+        too-long<\
+        =30  
+
+        # this is not a multi-line comment \
+        some-dep
+             # neither is this \
+        other-dep
+        another-dep  # but this *is* a multi-line coment \
+        so any garbage can be here
+        dep-a # and this comment ends with the blank line below \
+
+        dep-b
+        ${ENVVAR}
+        whitespace-stripped-before-substitution   ${SPACE}
+        ${MISSING_ENVVAR}
+    """.replace('<END>', ''))
+    monkeypatch.setenv('ENVVAR', 'package-from-env')
+    monkeypatch.setenv('SPACE', ' ')
+    monkeypatch.delenv('MISSING_ENVVAR', raising=False)
+    result = convert_requirements_txt(reqs_txt.splitlines())
+
+    expected = [
+        'Normal_Req ~= 1.2.0',
+        'whitespace-stripped < 3',
+        'foo <=    30',
+        'bar<=   30',
+        'this-was-too-long<=30',
+        'some-dep',
+        'other-dep',
+        'another-dep',
+        'dep-a',
+        'dep-b',
+        'package-from-env',
+        'whitespace-stripped-before-substitution    ',
+        '${MISSING_ENVVAR}',
+    ]
+    #result = expected
+    assert result == expected
+
+    # This test uses pip internals, so it might break in the future.
+    from pip._internal.req.req_file import preprocess
+    expected = [line for lineno, line in preprocess(reqs_txt)]
+    assert result == expected
+