Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 129 additions & 0 deletions vulnerabilities/pipelines/v2_importers/collect_repo_fix_commits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import re
import shutil
import tempfile
from collections import defaultdict

from git import Repo

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import AffectedPackageV2
from vulnerabilities.importer import PackageCommitPatchData
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2

# Regexes for vulnerability identifiers recognized in commit messages.
# Applied case-insensitively by extract_vulnerability_id().
SECURITY_PATTERNS = [
    r"\bCVE-\d{4}-\d{4,19}\b",  # e.g. CVE-2023-12345
    r"\bGHSA-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}\b",  # e.g. GHSA-2479-qvv7-47qq
    r"\bPYSEC-\d{4}-\d{1,6}\b",  # e.g. PYSEC-2021-12345
    r"\bXSA-\d{1,4}\b",  # e.g. XSA-43
]


class CollectRepoFixCommitPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    Pipeline to collect fix commits from any git repository.

    Scans every commit message in the configured repository for known
    vulnerability identifiers (see SECURITY_PATTERNS) and yields one
    AdvisoryData per identifier, listing the commits that mention it.
    """

    pipeline_id = "collect_fix_commit"

    @classmethod
    def steps(cls):
        return (
            cls.clone,
            cls.collect_and_store_advisories,
            cls.clean_downloads,
        )

    def clone(self):
        """
        Clone the repository named in ``self.inputs["repo_url"]`` into a
        temporary directory.

        Raises:
            ValueError: if no repo URL was provided.
        """
        # .get() so a missing key raises the intended ValueError below,
        # not a bare KeyError.
        self.repo_url = self.inputs.get("repo_url")
        if not self.repo_url:
            raise ValueError("Repo is required for CollectRepoFixCommitPipeline")

        self.purl = self.inputs["purl"]
        # Bare, blob-less clone keeps the download small: only commit
        # metadata (hashes and messages) is needed, never file contents.
        self.repo = Repo.clone_from(
            url=self.repo_url,
            to_path=tempfile.mkdtemp(),
            bare=True,
            no_checkout=True,
            multi_options=["--filter=blob:none"],
        )

    def advisories_count(self) -> int:
        # Unknown until every commit has been scanned.
        return 0

    def extract_vulnerability_id(self, commit) -> list[str]:
        """
        Extract vulnerability ids from a commit message.

        Returns a list of matched vulnerability IDs, de-duplicated while
        preserving first-seen order, so a message that repeats the same ID
        does not produce duplicate commit entries downstream.
        """
        matches = []
        for pattern in SECURITY_PATTERNS:
            matches.extend(re.findall(pattern, commit.message, flags=re.IGNORECASE))
        # dict.fromkeys de-duplicates while keeping insertion order.
        return list(dict.fromkeys(matches))

    def collect_fix_commits(self):
        """
        Iterate through repository commits and group them by vulnerability
        identifiers.

        Returns a mapping of {vuln_id: [(commit_hash, commit_message), ...]}.
        """
        self.log("Processing git repository fix commits (grouped by vulnerability IDs).")

        grouped_commits = defaultdict(list)
        for commit in self.repo.iter_commits("--all"):
            matched_ids = self.extract_vulnerability_id(commit)
            if not matched_ids:
                continue

            commit_id = commit.hexsha
            commit_message = commit.message.strip()

            for vuln_id in matched_ids:
                grouped_commits[vuln_id].append((commit_id, commit_message))

        self.log(f"Found {len(grouped_commits)} vulnerabilities with related commits.")
        self.log("Finished processing all commits.")
        return grouped_commits

    def collect_advisories(self):
        """
        Yield an AdvisoryData object for each vulnerability ID grouped with
        its related fix commits.
        """
        self.log("Generating AdvisoryData objects from grouped commits.")
        grouped_commits = self.collect_fix_commits()
        for vuln_id, commits_data in grouped_commits.items():
            if not commits_data or not vuln_id:
                continue

            summary_lines = [f"{c_hash}: {msg}" for c_hash, msg in commits_data]
            summary = f"Commits fixing {vuln_id}:\n" + "\n".join(summary_lines)

            # A set keeps each commit-patch entry unique even if the same
            # commit was grouped more than once for this vulnerability.
            commit_hash_set = {commit_hash for commit_hash, _ in commits_data}
            affected_packages = [
                AffectedPackageV2(
                    package=self.purl,
                    fixed_by_commit_patches=[
                        PackageCommitPatchData(vcs_url=self.repo_url, commit_hash=commit_hash)
                        for commit_hash in commit_hash_set
                    ],
                )
            ]

            # NOTE(review): advisory_id is the bare vulnerability ID. If this
            # pipeline is run against several repositories, the same ID from
            # two repos produces the same AVID and the advisories collide —
            # TODO: scope the advisory id (or pipeline_id) per repository.
            yield AdvisoryData(
                advisory_id=vuln_id,
                summary=summary,
                affected_packages=affected_packages,
                url=self.repo_url,
            )

    def clean_downloads(self):
        """Remove the temporary clone created by ``clone``."""
        self.log("Cleaning up local repository resources.")
        if hasattr(self, "repo") and self.repo.working_dir:
            # Close first so GitPython releases open pack/index handles
            # (required on Windows before the directory can be deleted).
            self.repo.close()
            # ignore_errors so cleanup never masks an earlier failure
            # when invoked from on_failure().
            shutil.rmtree(path=self.repo.working_dir, ignore_errors=True)

    def on_failure(self):
        """Ensure cleanup is always performed on failure."""
        self.clean_downloads()
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import json
from pathlib import Path
from unittest import TestCase
from unittest.mock import MagicMock
from unittest.mock import patch

import pytest
from packageurl import PackageURL

from vulnerabilities.pipelines.v2_importers.collect_repo_fix_commits import (
CollectRepoFixCommitPipeline,
)
from vulnerabilities.tests import util_tests


@pytest.fixture
def pipeline():
    """Build a pipeline with a stub repo URL and silenced logging."""
    instance = CollectRepoFixCommitPipeline()
    instance.repo_url = "https://github.com/test/repo"
    instance.log = MagicMock()
    return instance


def test_classify_commit_type_extracts_ids(pipeline):
    """A message carrying both a CVE and a GHSA id yields both, in order."""

    class DummyCommit:
        message = "Fix for CVE-2023-1234 and GHSA-2479-qvv7-47qq"

    extracted = pipeline.extract_vulnerability_id(DummyCommit)
    assert extracted == ["CVE-2023-1234", "GHSA-2479-qvv7-47qq"]


@patch("vulnerabilities.pipelines.v2_importers.collect_repo_fix_commits.Repo")
def test_collect_fix_commits_groups_by_vuln(mock_repo, pipeline):
    """Commits are grouped under the vulnerability ids found in their messages."""
    commit1 = MagicMock(message="Fix CVE-2021-0001", hexsha="abc123")
    commit2 = MagicMock(message="Patch GHSA-dead-beef-baad", hexsha="def456")
    commit3 = MagicMock(message="Unrelated change", hexsha="ghi789")

    pipeline.repo = MagicMock()
    pipeline.repo.iter_commits.return_value = [commit1, commit2, commit3]

    # Stub extract_vulnerability_id — the method collect_fix_commits actually
    # calls — so this test exercises only the grouping logic. The previous
    # version mocked a non-existent ``classify_commit_type`` attribute, which
    # was never invoked, so the test silently fell through to the real regexes.
    pipeline.extract_vulnerability_id = MagicMock(
        side_effect=lambda c: (
            ["CVE-2021-0001"]
            if "CVE" in c.message
            else ["GHSA-dead-beef-baad"]
            if "GHSA" in c.message
            else []
        )
    )

    grouped = pipeline.collect_fix_commits()

    expected = {
        "CVE-2021-0001": [("abc123", "Fix CVE-2021-0001")],
        "GHSA-dead-beef-baad": [("def456", "Patch GHSA-dead-beef-baad")],
    }

    assert grouped == expected


# Directory holding JSON fixtures for the fix-commit pipeline tests.
TEST_DATA = Path(__file__).parent.parent.parent / "test_data" / "fix_commits"


class TestRepoFixCommitPipeline(TestCase):
    def test_collect_advisories_from_json(self):
        """Advisories built from fixture commit groups match the expected JSON."""
        input_file = TEST_DATA / "grouped_commits_input.json"
        expected_file = TEST_DATA / "expected_linux_advisory_output.json"

        pipeline = CollectRepoFixCommitPipeline()
        pipeline.repo_url = "https://github.com/test/repo"
        pipeline.purl = PackageURL.from_string("pkg:generic/test")
        pipeline.log = MagicMock()
        pipeline.collect_fix_commits = MagicMock(
            return_value=json.loads(input_file.read_text(encoding="utf-8"))
        )

        advisories = [advisory.to_dict() for advisory in pipeline.collect_advisories()]

        util_tests.check_results_against_json(advisories, expected_file, True)


@pytest.mark.parametrize(
    "commit_message, expected_ids",
    [
        ("Fix CVE-2023-12345 buffer overflow", ["CVE-2023-12345"]),
        ("Address GHSA-abcd-1234-efgh report", ["GHSA-abcd-1234-efgh"]),
        ("Python security PYSEC-2021-12345 fix", ["PYSEC-2021-12345"]),
        ("Xen XSA-43 security update", ["XSA-43"]),
        (
            "Fix CVE-2023-1111 and GHSA-aaaa-bbbb-cccc in kernel",
            ["CVE-2023-1111", "GHSA-aaaa-bbbb-cccc"],
        ),
        ("Refactor logging system with no security ID", []),
    ],
)
def test_classify_commit_type_detects_vuln_ids(pipeline, commit_message, expected_ids):
    """Every supported identifier family is extracted from commit messages."""

    class FakeCommit:
        def __init__(self, text):
            self.message = text

    found = pipeline.extract_vulnerability_id(FakeCommit(commit_message))
    assert found == expected_ids, f"Unexpected result for message: {commit_message}"


def test_classify_commit_type_case_insensitive(pipeline):
    """Ensure pattern matching is case-insensitive."""

    class DummyCommit:
        message = "fix cVe-2022-9999 and ghSa-dead-beef-baad"

    uppercased = [match.upper() for match in pipeline.extract_vulnerability_id(DummyCommit)]
    assert any("CVE-2022-9999" in value for value in uppercased)
    assert any("GHSA-DEAD-BEEF-BAAD" in value for value in uppercased)
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
[
{
"advisory_id": "CVE-2021-0001",
"aliases": [],
"summary": "Commits fixing CVE-2021-0001:\n41b43c74bda19753c757036673ea9db74acf494a: Fixed CVE-2025-59681 -- Protected QuerySet.annotate(), alias(), aggregate(), and extra() against SQL injection in column aliases on MySQL/MariaDB.",
"affected_packages": [
{
"package": {
"type": "generic",
"namespace": "",
"name": "test",
"version": "",
"qualifiers": "",
"subpath": ""
},
"affected_version_range": null,
"fixed_version_range": null,
"introduced_by_commit_patches": [],
"fixed_by_commit_patches": [
{
"vcs_url": "https://github.com/test/repo",
"commit_hash": "41b43c74bda19753c757036673ea9db74acf494a",
"patch_text": null,
"patch_checksum": null
}
]
}
],
"references_v2": [],
"patches": [],
"severities": [],
"date_published": null,
"weaknesses": [],
"url": "https://github.com/test/repo"
},
{
"advisory_id": "GHSA-dead-beef-baad",
"aliases": [],
"summary": "Commits fixing GHSA-dead-beef-baad:\n49ff1042aa66bb25eda87e9a8ef82f3b0ad4eeba: Fixed CVE-2024-53907 -- Mitigated potential DoS in strip_tags().",
"affected_packages": [
{
"package": {
"type": "generic",
"namespace": "",
"name": "test",
"version": "",
"qualifiers": "",
"subpath": ""
},
"affected_version_range": null,
"fixed_version_range": null,
"introduced_by_commit_patches": [],
"fixed_by_commit_patches": [
{
"vcs_url": "https://github.com/test/repo",
"commit_hash": "49ff1042aa66bb25eda87e9a8ef82f3b0ad4eeba",
"patch_text": null,
"patch_checksum": null
}
]
}
],
"references_v2": [],
"patches": [],
"severities": [],
"date_published": null,
"weaknesses": [],
"url": "https://github.com/test/repo"
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"CVE-2021-0001": [
["41b43c74bda19753c757036673ea9db74acf494a", "Fixed CVE-2025-59681 -- Protected QuerySet.annotate(), alias(), aggregate(), and extra() against SQL injection in column aliases on MySQL/MariaDB."]
],
"GHSA-dead-beef-baad": [
["49ff1042aa66bb25eda87e9a8ef82f3b0ad4eeba", "Fixed CVE-2024-53907 -- Mitigated potential DoS in strip_tags()."]
]
}