# Source code for sphinx_immaterial.apidoc.cpp.api_parser
"""Converts a preprocessed C++ source file into a JSON API description.
This program expects as input preprocessed C++ source code obtained using the
`-E` (preprocessed), `-C` (preserve comments), and `-dD` (preserve macro
definitions) GCC/Clang options. It also accepts a list of compiler options to
be passed to Clang, although these are mostly irrelevant due to the prior
preprocessing.
It uses a combination of the libclang Python bindings and the C++ parser in the
Sphinx Python package to parse declarations and definitions and produce a JSON
representation of the API.
The JSON representation describes the API as a set of "entities", each with a
unique id, and relations between them.
Libclang can handle arbitrary C++ syntax, but provides very restricted access to
the resultant AST. To workaround those limitations, in some cases extracted
declarations or portions of declarations are converted back to source
representation and re-parsed using the C++ parser provided by the Sphinx C++
domain. The C++ parser in Sphinx is extremely limited and as it does not rely
on a symbol table does not always correctly handle template arguments, but
provides full access to its AST.
This program performs a number of transformations on the declarations:
- Exported entities are filtered using various criteria
- `std::enable_if_t` uses are converted to C++20 requires clauses.
- Internal return types are elided (replaced with auto).
"""
import argparse
import dataclasses
import functools
import json
import os
import pathlib
import re
import time
import typing
from typing import (
cast,
Dict,
Any,
List,
Tuple,
Optional,
Sequence,
Union,
Pattern,
Literal,
Callable,
TypedDict,
)
from textwrap import dedent
import ctypes
import clang.cindex
from clang.cindex import (
Cursor,
CursorKind,
Token,
TokenKind,
TranslationUnit,
SourceLocation,
SourceRange,
)
import docutils.nodes
import pydantic.dataclasses
import sphinx.domains.cpp
import sphinx.util.logging
from typing_extensions import NotRequired
from . import ast_fixes # pylint: disable=unused-import
logger = sphinx.util.logging.getLogger(__name__)
# Pattern that can never match: "a" followed by the start-of-string anchor.
_UNMATCHABLE_REGEXP = re.compile("a^")
def _combine_regexp_list(items: Sequence[Union[str, Pattern[str]]]) -> re.Pattern:
if not items:
return _UNMATCHABLE_REGEXP
def get_parenthesized_source(x: Union[str, Pattern[str]]):
if isinstance(x, re.Pattern):
x = x.pattern
return f"(?:{x})"
return re.compile("|".join(get_parenthesized_source(x) for x in items))
def _make_replacement_pattern(
strings: List[str], prefix: str, suffix: str
) -> re.Pattern:
if not strings:
return _UNMATCHABLE_REGEXP
return re.compile(
"|".join(rf"(?:{prefix}{re.escape(before)}{suffix})" for before in strings)
)
# Matches a type template parameter defaulted to `std::enable_if_t<...>`;
# group 1 captures the enable_if condition.
TEMPLATE_PARAMETER_ENABLE_IF_TYPE_PATTERN = re.compile(
    r"\s*(?:typename|class)\s*=\s*std\s*::\s*enable_if_t\s*<(.*)>\s*"
)
# Matches a non-type `std::enable_if_t<...>* = nullptr` (or `= 0`) parameter;
# group 1 captures the enable_if condition.
TEMPLATE_PARAMETER_ENABLE_IF_NON_TYPE_PATTERN = re.compile(
    r"\s*std\s*::\s*enable_if_t\s*<(.*)>\s*\*\s*=\s*(nullptr|0)\s*"
)
# Matches doc-comment commands such as `\ingroup foo` or `@relates Bar`;
# group 1 is the command name, group 2 its argument.
SPECIAL_GROUP_COMMAND_PATTERN = re.compile(
    r"^(?:\\|@)(ingroup|relates|membergroup|id)\s+(.+[^\s])\s*$", re.MULTILINE
)
# [docs]
@pydantic.dataclasses.dataclass
class Config:
    """Specifies a C++ API parsing configuration.

    Based on this configuration, a description of the API is generated.
    """

    input_path: str = "__input.cpp"
    """Path to the input file to parse.
    This may either be a path to an existing file, or `.input_content` may
    specify its content, in which case the filesystem is not accessed.
    If `.input_content` is specified and merely contains :cpp:`#include`
    directives, then the actual path does not matter and may be left as the
    default value.
    """

    input_content: Optional[bytes] = None
    """Specifies the content of `.input_path`.
    If unspecified, the content is read from filesystem.
    """

    compiler_flags: List[str] = dataclasses.field(default_factory=list)
    """List of compiler flags to pass to Clang."""

    verbose: bool = False
    """Parse in verbose mode."""

    include_directory_map: Dict[str, str] = dataclasses.field(default_factory=dict)
    """Maps actual include directories to a displayed directory name.
    The keys should be prefixes of paths specified in error messages/source
    locations identified by clang.
    The values should be the corresponding prefix to use in the documented
    :cpp:`#include` paths.
    """

    allow_paths: List[Pattern] = dataclasses.field(
        default_factory=lambda: [re.compile("")]
    )
    """List of regular expressions matching *allowed* paths.
    Only entities defined in files that match `.allow_paths`, and don't match
    `.disallow_paths`, are documented. By default all entities are documented,
    but this default is not normally usable, because it will include entities
    defined in the standard library and third-party libraries.

    .. important::

        When building on Windows, all path separators are normalized to :python:`"/"`.
        Therefore, in the specified regular expressions, always use :python:`"/"` to
        match a path separator.
    """

    disallow_paths: List[Pattern] = dataclasses.field(default_factory=list)
    """List of regular expressions matching *disallowed* paths.
    Entities defined in files matching any of these patterns are not documented.

    .. important::

        When building on Windows, all path separators are normalized to :python:`"/"`.
        Therefore, in the specified regular expressions, always use :python:`"/"` to
        match a path separator.
    """

    disallow_namespaces: List[Pattern] = dataclasses.field(default_factory=list)
    """List of regular expressions matching *disallowed* namespaces.
    Entities defined in namespaces matching any of the specified patterns are
    not documented.
    """

    allow_symbols: List[Pattern] = dataclasses.field(
        default_factory=lambda: [re.compile("")]
    )
    """List of regular expressions matching *allowed* symbols.
    Only symbols matching one of the `.allow_symbols` patterns, and not matching
    `.disallow_symbols`, are documented. By default, all symbols are allowed.
    """

    disallow_symbols: List[Pattern] = dataclasses.field(default_factory=list)
    """List of regular expressions matching *disallowed* symbols.
    Symbols matching any of these patterns are undocumented.
    """

    allow_macros: List[Pattern] = dataclasses.field(
        default_factory=lambda: [re.compile("")]
    )
    """List of regular expressions matching *allowed* macros.
    Only macros names matching `.allow_macros`, and not matching
    `.disallow_macros`, are documented.
    """

    disallow_macros: List[Pattern] = dataclasses.field(default_factory=list)
    """List of regular expressions matching *disallowed* macro names.
    Macros matching any of these patterns are undocumented.
    """

    ignore_diagnostics: List[Pattern] = dataclasses.field(default_factory=list)
    """List of regular expressions matching diagnostics to ignore.
    Diagnostics matching any of these patterns are ignored.
    """

    template_parameter_enable_if_patterns: List[Pattern] = dataclasses.field(
        default_factory=lambda: [
            TEMPLATE_PARAMETER_ENABLE_IF_TYPE_PATTERN,
            TEMPLATE_PARAMETER_ENABLE_IF_NON_TYPE_PATTERN,
        ]
    )
    """Patterns used to recognize `std::enable_if_t` template parameters."""

    type_replacements: Dict[str, str] = dataclasses.field(default_factory=dict)
    """Maps type spellings to the replacement spelling used in the output."""

    hide_types: List[Pattern] = dataclasses.field(default_factory=list)
    """List of regular expressions matching *hidden* types.
    Matching return types are replaced with :cpp:`auto`, and matching
    initializers are elided.
    """

    ignore_template_parameters: List[Pattern] = dataclasses.field(default_factory=list)
    """List of regular expressions matching *ignored* template parameters.
    Template parameters with a declaration matching any of these patterns are
    excluded from the generated documentation.
    """

    hide_initializers: List[Pattern] = dataclasses.field(
        default_factory=lambda: [re.compile(r"^=\s*(?:(true|false)\s*$|\[)")]
    )
    """List of regular expressions matching initializers to elide.
    Any matching initializer expression is elided from the generated
    documentation.
    """

    # Derived from `allow_paths`.
    allow_path_pattern: Pattern = dataclasses.field(init=False)
    # Derived from `disallow_paths`.
    disallow_path_pattern: Pattern = dataclasses.field(init=False)
    # Derived from `allow_symbols`.
    allow_symbols_pattern: Pattern = dataclasses.field(init=False)
    # Derived from `disallow_symbols`.
    disallow_symbols_pattern: Pattern = dataclasses.field(init=False)
    # Derived from `allow_macros`.
    allow_macros_pattern: Pattern = dataclasses.field(init=False)
    # Derived from `disallow_macros`.
    disallow_macros_pattern: Pattern = dataclasses.field(init=False)
    # Derived from `ignore_diagnostics`.
    ignore_diagnostics_pattern: Pattern = dataclasses.field(init=False)
    # Derived from `hide_types`.
    hide_types_pattern: Pattern = dataclasses.field(init=False)
    # Derived from `type_replacements`.
    type_replacements_pattern: Pattern = dataclasses.field(init=False)
    # Derived from `ignore_template_parameters`.
    ignore_template_parameters_pattern: Pattern = dataclasses.field(init=False)
    # Derived from `hide_initializers`.
    hide_initializers_pattern: Pattern = dataclasses.field(init=False)
    # Derived from `include_directory_map`.
    include_directory_map_pattern: Pattern = dataclasses.field(init=False)
    # Derived from `disallow_namespaces`.
    disallow_namespaces_pattern: Pattern = dataclasses.field(init=False)

    def __post_init__(self):
        # Compile each user-supplied pattern list into a single combined
        # pattern used for matching throughout this module.
        self.allow_path_pattern = _combine_regexp_list(self.allow_paths)  # type: ignore[misc]
        self.disallow_path_pattern = _combine_regexp_list(self.disallow_paths)  # type: ignore[misc]
        self.disallow_namespaces_pattern = _combine_regexp_list(
            self.disallow_namespaces
        )
        self.allow_symbols_pattern = _combine_regexp_list(self.allow_symbols)  # type: ignore[misc]
        self.disallow_symbols_pattern = _combine_regexp_list(self.disallow_symbols)  # type: ignore[misc]
        self.allow_macros_pattern = _combine_regexp_list(self.allow_macros)  # type: ignore[misc]
        self.disallow_macros_pattern = _combine_regexp_list(self.disallow_macros)  # type: ignore[misc]
        self.ignore_diagnostics_pattern = _combine_regexp_list(self.ignore_diagnostics)  # type: ignore[misc]
        self.hide_types_pattern = _combine_regexp_list(self.hide_types)  # type: ignore[misc]
        self.type_replacements_pattern = _make_replacement_pattern(  # type: ignore[misc]
            list(self.type_replacements.keys()), prefix=r"\b", suffix=r"\b"
        )
        self.ignore_template_parameters_pattern = _combine_regexp_list(  # type: ignore[misc]
            self.ignore_template_parameters
        )
        self.hide_initializers_pattern = _combine_regexp_list(self.hide_initializers)  # type: ignore[misc]
        if os.name == "nt":
            # Normalize Windows path separators so patterns can always use "/".
            self.normalized_include_directory_map = {  # type: ignore[misc]
                key.replace("\\", "/"): value
                for key, value in self.include_directory_map.items()
            }
        else:
            self.normalized_include_directory_map = self.include_directory_map  # type: ignore[misc]
        self.include_directory_map_pattern = _make_replacement_pattern(  # type: ignore[misc]
            list(self.normalized_include_directory_map.keys()), prefix="^", suffix=""
        )
        self.cached_mapped_include_directories = {}  # type: ignore[misc]

    # `include_directory_map` with separators normalized (see `__post_init__`).
    normalized_include_directory_map: Dict[str, str] = dataclasses.field(init=False)
    # Memoization cache for `map_include_path`.
    cached_mapped_include_directories: Dict[str, str] = dataclasses.field(init=False)

    def map_include_path(self, path: str) -> str:
        """Applies `include_directory_map` to `path`, caching the result."""
        mapped = self.cached_mapped_include_directories.get(path)
        if mapped is not None:
            return mapped
        if os.name == "nt":
            path = path.replace("\\", "/")
        if path.startswith("./"):
            path = path[2:]
        new_mapped = self.include_directory_map_pattern.sub(
            lambda m: self.normalized_include_directory_map[m.group(0)], path
        )
        self.cached_mapped_include_directories[path] = new_mapped
        return new_mapped
# Unique identifier of a documented entity (derived from the libclang USR).
EntityId = str
# All entity kinds that may appear in the JSON output.
# NOTE(review): "destructor" appears in `FunctionEntityKind` but not here, and
# "macro" (see `MacroEntity`) is also missing — confirm whether intentional.
EntityKind = Literal[
    "class",
    "conversion_function",
    "function",
    "method",
    "constructor",
    "var",
    "alias",
    "enum",
]
# Subset of entity kinds that are function-like.
FunctionEntityKind = Literal[
    "conversion_function", "function", "method", "constructor", "destructor"
]
# C++ class-key used to introduce a class type.
ClassKeyword = Literal["class", "struct"]
class JsonLocation(TypedDict):
    """Source location in the JSON API description."""

    file: str
    line: int
    col: int
class JsonDocComment(TypedDict):
    """Documentation comment text plus its source location."""

    text: str
    location: JsonLocation
# JSON classification of a template parameter.
TemplateParameterKind = Literal["type", "template", "non_type"]
class TemplateParameter(TypedDict):
    """JSON representation of a single template parameter."""

    declaration: str
    name: str
    kind: TemplateParameterKind
    # True if the parameter is a parameter pack (`...`).
    pack: bool
class CppApiEntityBase(TypedDict, total=False):
    """Fields common to every entity in the JSON API description."""

    id: EntityId
    parent: NotRequired[EntityId]
    scope: NotRequired[str]
    doc: NotRequired[Optional[JsonDocComment]]
    document_with: NotRequired[EntityId]
    siblings: NotRequired[List[EntityId]]
    name: str
    template_parameters: NotRequired[Optional[List[TemplateParameter]]]
    location: JsonLocation
    special_id: NotRequired[Optional[str]]
    page_name: str
    # C++20 requires-clause expressions (also synthesized from enable_if_t).
    requires: Optional[List[str]]
    # Primary template this entity specializes (`True` if not resolvable).
    specializes: Union[None, EntityId, Literal[True]]
    related_members: Dict[str, List[EntityId]]
    related_nonmembers: Dict[str, List[EntityId]]
    # Values of `\membergroup`/`\ingroup`/`\relates` doc commands
    # (see SPECIAL_GROUP_COMMAND_PATTERN).
    special_membergroup: str
    special_ingroup: str
    special_relates: str
    document_prefix: str
    # Arguments of `// NONITPICK:` comments attached to the declaration.
    nonitpick: List[str]
class FunctionEntity(CppApiEntityBase):
    """JSON entity for a function-like declaration."""

    kind: FunctionEntityKind
    # Number of declared parameters (PARM_DECL children).
    arity: int
    # Placeholder identifier substituted for the real name in `declaration`.
    name_substitute: str
    friend: bool
    declaration: str
class BaseClass(TypedDict):
    """A documented base class of a class entity."""

    type: str
    # Access specifier spelling: "public", "protected", or "private".
    access: str
class ClassEntity(CppApiEntityBase):
    """JSON entity for a class/struct declaration."""

    kind: Literal["class"]
    keyword: ClassKeyword
    prefix: List[str]
    bases: List[BaseClass]
class VarEntity(CppApiEntityBase):
    """JSON entity for a variable declaration."""

    kind: Literal["var"]
    declaration: str
    name_substitute: str
    # Initializer source (prefixed with "= "), or None if absent/elided.
    initializer: Optional[str]
class TypeAliasEntity(CppApiEntityBase):
    """JSON entity for a type alias (using/typedef) declaration."""

    kind: Literal["alias"]
    # The aliased type, or None if hidden via `Config.hide_types`.
    underlying_type: Optional[str]
class MacroEntity(CppApiEntityBase):
    """JSON entity for a preprocessor macro definition."""

    kind: Literal["macro"]
    # Parameter names for function-like macros; None otherwise.
    parameters: Optional[List[str]]
class EnumeratorEntity(TypedDict):
    """JSON representation of a single enumerator."""

    kind: Literal["enumerator"]
    id: EntityId
    name: str
    decl: str
    doc: Optional[JsonDocComment]
    location: JsonLocation
class EnumEntity(CppApiEntityBase):
    """JSON entity for an enum declaration."""

    kind: Literal["enum"]
    # "class"/"struct" for scoped enums, None for unscoped enums.
    keyword: Optional[ClassKeyword]
    enumerators: List[EnumeratorEntity]
# Union of all entity variants, discriminated by the `kind` field.
CppApiEntity = Union[
    ClassEntity, FunctionEntity, VarEntity, TypeAliasEntity, MacroEntity, EnumEntity
]
def json_location_to_string(location: Optional[JsonLocation]) -> Optional[str]:
    """Formats `location` as ``file:line:col``, or returns None if absent."""
    if location is None:
        return None
    return f"{location['file']}:{location['line']}:{location['col']}"
def get_entity_id(cursor: Cursor) -> EntityId:
    """Returns a unique identifier for the entity declared by `cursor`."""
    # USR workarounds from:
    # https://github.com/foonathan/cppast/blob/e558e2d58f519e3a83af770d460672b1d4ba2886/src/libclang/parse_functions.cpp#L13
    usr = cursor.get_usr()
    if cursor.kind in (CursorKind.FUNCTION_TEMPLATE, CursorKind.CONVERSION_FUNCTION):
        # Combine return type with USR to prevent collisions
        return f"{usr} {cursor.result_type.spelling}"
    if cursor.kind == CursorKind.CLASS_TEMPLATE_PARTIAL_SPECIALIZATION:
        # libclang issue: templ<T()> vs templ<T() &>
        # but identical USR
        # same workaround: combine display name with usr
        # (and hope this prevents all collisions...)
        return f"{usr} {cursor.displayname}"
    return usr
def _substitute_internal_type_names(config: Config, decl: str) -> str:
    """Applies the configured `type_replacements` map to `decl`."""

    def _replace(match: "re.Match") -> str:
        return config.type_replacements[match.group(0)]

    return config.type_replacements_pattern.sub(_replace, decl)
def get_previous_line_location(tu, location: SourceLocation):
    """Returns a location at column 1 of the line preceding `location` in `tu`."""
    # Removed unused locals `f`/`line` from the original implementation.
    return SourceLocation.from_position(tu, location.file, location.line - 1, 1)
def get_presumed_location(location: SourceLocation) -> typing.Tuple[str, int, int]:
    """Returns the presumed ``(file, line, column)`` for `location`.

    Calls the `clang_getPresumedLocation` C API directly via ctypes; the
    presumed location honors the `#line` markers present in the preprocessed
    input (see module docstring).
    """
    f, l, c = clang.cindex._CXString(), ctypes.c_uint(), ctypes.c_uint()
    clang.cindex.conf.lib.clang_getPresumedLocation(
        location, ctypes.byref(f), ctypes.byref(l), ctypes.byref(c)
    )
    return (clang.cindex._CXString.from_result(f), int(l.value), int(c.value))
def _get_template_cursor_kind(cursor: Cursor) -> CursorKind:
    """Returns the kind of the declaration that the template `cursor` templates
    (via the `clang_getTemplateCursorKind` C API)."""
    return CursorKind.from_id(clang.cindex.conf.lib.clang_getTemplateCursorKind(cursor))
def _get_specialized_cursor_template(cursor: Cursor) -> typing.Optional[Cursor]:
    """Returns the template that `cursor` specializes, if any
    (via the `clang_getSpecializedCursorTemplate` C API)."""
    return clang.cindex.conf.lib.clang_getSpecializedCursorTemplate(cursor)
def _is_doc_comment(token: Token):
    """Returns True if `token` is a `///`-style documentation comment."""
    return token.spelling.startswith("///")
def _get_full_nested_name(cursor: typing.Optional[Cursor]) -> str:
    """Returns the fully qualified scope prefix of `cursor`, e.g. ``a::b::``.

    Namespaces contribute their plain spelling; other ancestors contribute
    their display name (which includes template arguments).
    """
    if cursor is None:
        return ""
    parts = []
    node = cursor
    while node.kind != CursorKind.TRANSLATION_UNIT:
        if node.kind == CursorKind.NAMESPACE:
            component = node.spelling
        else:
            component = node.displayname
        parts.append(component + "::")
        node = node.semantic_parent
    return "".join(reversed(parts))
# Cursor kinds that declare a class type; their members are also scanned.
CLASS_KINDS = (
    CursorKind.STRUCT_DECL,
    CursorKind.CLASS_DECL,
    CursorKind.CLASS_TEMPLATE,
    CursorKind.CLASS_TEMPLATE_PARTIAL_SPECIALIZATION,
)
def _get_all_decls(config: Config, cursor: Cursor, allow_file):
    """Recursively yields documentable declaration cursors under `cursor`.

    :param allow_file: Optional predicate on presumed file names; `None` means
        the enclosing scope was already accepted.
    """
    NAMESPACE = CursorKind.NAMESPACE
    for child in cursor.get_children():
        location = child.location
        if location.file is None:
            # No associated file (e.g. compiler built-ins); skip.
            continue
        kind = child.kind
        if kind == NAMESPACE:
            # Recurse into allowed namespaces; the namespace itself is not
            # yielded as an entity.
            if (
                not allow_file or allow_file(get_presumed_location(location)[0])
            ) and not config.disallow_namespaces_pattern.match(child.spelling):
                yield from _get_all_decls(config, child, None)
            continue
        # `ALLOWED_KINDS` is defined elsewhere in this module.
        if kind not in ALLOWED_KINDS:
            continue
        if allow_file and not allow_file(get_presumed_location(location)[0]):
            continue
        if child.kind == CursorKind.MACRO_DEFINITION:
            # NOTE(review): this branch is redundant with the fall-through
            # `yield child` below, since macros never match `CLASS_KINDS`.
            yield child
            continue
        yield child
        if kind in CLASS_KINDS:
            # Also yield the members of class types.
            yield from _get_all_decls(config, child, None)
def split_doc_comment_into_lines(cmt: str) -> List[str]:
    """Strip the raw string of an object's comment into lines.

    :param cmt: the comment to parse.
    :returns: A list of the lines without the surrounding C++ comment syntax.
    """
    # split into a list of lines & account for CRLF and LF line endings
    body = [line.rstrip("\r") for line in cmt.splitlines()]
    if not body:
        # Guard against an empty comment, which previously raised IndexError
        # on the `body[0]` access below.
        return [""]
    # strip all the comment syntax out
    if body[0].startswith("//"):
        body = [line.lstrip("/").lstrip("!").lstrip("<") for line in body]
    elif body[0].startswith("/*"):
        body[0] = body[0].lstrip("/").lstrip("*").lstrip("!")
        multi_lined_asterisk = True  # works also for single-line comment blocks
        if len(body) > 1:
            # Only strip leading '*' when *every* continuation line has one.
            line_has_asterisk = [line.startswith("*") for line in body[1:]]
            multi_lined_asterisk = line_has_asterisk.count(True) == len(body) - 1
        body = [
            (line.lstrip("*").lstrip("<") if multi_lined_asterisk else line)
            for line in body
        ]
        # rstrip("*/") strips trailing '*' and '/' characters (a character
        # set, not a literal suffix), removing the closing delimiter.
        body[-1] = body[-1].rstrip("*/").rstrip()
    body = dedent("\n".join(body)).splitlines()
    return [""] if not body else body
# Matches non-documentation comments: `//` lines not starting with `///` or
# `//!`, and `/* ... */` blocks not starting with `/**` or `/*!`.
NON_DOC_COMMENT = re.compile(
    r"(^//[^/\!].*$\n)|(^/\*[^\*\!](?:.|\n)*?\*/$\n)", re.MULTILINE
)
def get_doc_comment(config: Config, cursor: Cursor) -> Optional[JsonDocComment]:
    """Returns the documentation comment associated with `cursor`, if any.

    The returned location is column 1 of the line containing the cursor's
    first token (fallback: the cursor's own location).
    """
    translation_unit = cursor.translation_unit
    # Use the location of the first token, if there is one.
    for token in cursor.get_tokens():
        location = token.location
        break
    else:
        location = cursor.location
    f = location.file
    line = location.line
    end_location = SourceLocation.from_position(translation_unit, f, line, 1)
    comment = cursor.raw_comment
    if not comment:
        return None
    comment_lines = []
    # The first line is never indented (in `raw_comment` form).
    # Clang doesn't strip indentation from subsequent lines in an indented block.
    # So, dedent all subsequent lines only
    first_line_end = comment.find("\n")
    comment = comment[:first_line_end] + dedent(comment[first_line_end:])
    # remove any non-docstring comments
    match = NON_DOC_COMMENT.search(comment)
    while match is not None:
        # strip comment syntax from the block before non-doc comment
        comment_lines.extend(split_doc_comment_into_lines(comment[: match.start()]))
        # Append blank lines as replacement of non-doc comment.
        # This should retain the src's line numbers
        comment_lines.extend(["\n"] * match.group(0).count("\n"))
        comment = comment[match.end() :]
        match = NON_DOC_COMMENT.search(comment)
    if comment:  # strip comment from any block that remains after non-doc comment
        comment_lines.extend(split_doc_comment_into_lines(comment))
    return {
        "text": "\n".join(comment_lines),
        "location": _get_location_json(config, end_location),
    }
class Extractor:
    """Parses the configured input with libclang and collects the top-level
    declarations to document (exposed as `self.decls`)."""

    def __init__(self, config: Config):
        self.config = config
        input_path = config.input_path
        input_content = config.input_content
        if input_content is None:
            input_content = pathlib.Path(input_path).read_bytes()
        # Comment out clang module pragmas left in the preprocessed source so
        # they do not affect re-parsing.
        input_content = re.sub(
            b"#pragma clang module", b"//#pragma clang module", input_content
        )
        self.input_source = input_content
        self.index = clang.cindex.Index.create()
        start_time = time.time()
        self.tu = self.index.parse(
            input_path,
            unsaved_files=[(input_path, input_content)],
            args=tuple(config.compiler_flags) + ("-ferror-limit=0",),
            options=(  # TranslationUnit.PARSE_SKIP_FUNCTION_BODIES +
                TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD
            ),
        )
        end_time = time.time()
        if config.verbose:
            logger.info("Parsed C++ input in %.5f seconds", end_time - start_time)
        # Report clang diagnostics, demoting configured ones to info messages.
        for diag in self.tu.diagnostics:
            if config.ignore_diagnostics_pattern.search(diag.spelling):
                if config.verbose:
                    logger.info(
                        diag.spelling,
                        location=_get_location_string(config, diag.location),
                    )
                continue
            logger.error(
                diag.spelling, location=_get_location_string(config, diag.location)
            )

        def _allow_file(path: str) -> bool:
            # A file is documented iff it matches `allow_paths` and does not
            # match `disallow_paths` (after include-directory mapping).
            path = config.map_include_path(path)
            if not config.allow_path_pattern.search(path):
                return False
            if config.disallow_path_pattern.search(path):
                return False
            return True

        # Memoize the per-file predicate since many cursors share a file.
        self.decls = list(
            _get_all_decls(
                config, self.tu.cursor, functools.lru_cache(maxsize=None)(_allow_file)
            )
        )
# Compiler flags to exclude; the consumer of this set is elsewhere in this
# module (not visible in this chunk).
EXCLUDED_COMPILER_FLAGS = frozenset(
    [
        "-Xclang=-disable-noundef-analysis",
    ]
)
def _transform_type_alias_decl(config: Config, decl: Cursor):
    """Converts a non-template `using`/`typedef` declaration into a JSON alias
    entity."""
    aliased: Optional[str] = _substitute_internal_type_names(
        config, decl.underlying_typedef_type.spelling
    )
    assert aliased is not None
    # Aliased types matching `hide_types` are elided from the output.
    hidden = config.hide_types_pattern.search(aliased)
    return {
        "kind": "alias",
        "name": decl.spelling,
        "underlying_type": None if hidden else aliased,
    }
def get_extent_spelling(translation_unit: TranslationUnit, extent: SourceRange) -> str:
    """Returns the C++ source representation for the specified extent.

    Comments are excluded and for simplicity all tokens are separated by
    whitespace. This results in excessive whitespace, but that does not matter
    because this is intended to be parsed by the Sphinx cpp domain anyway.
    """

    def get_spellings():
        # One-token lookahead: each token is yielded only after its successor
        # is seen, so the final token can receive special truncation below.
        # (Removed an unused `spellings` list from the original.)
        prev_token = None
        COMMENT = TokenKind.COMMENT
        for token in translation_unit.get_tokens(extent=extent):
            if prev_token is not None:
                yield prev_token.spelling
                prev_token = None
            if token.kind == COMMENT:
                continue
            prev_token = token
        # We need to handle the last token specially, because clang sometimes parses
        # ">>" as a single token but the extent may cover only the first of the two
        # angle brackets.
        if prev_token is not None:
            spelling = prev_token.spelling
            token_end = cast(SourceLocation, prev_token.extent.end)
            offset_diff = token_end.offset - cast(SourceLocation, extent.end).offset
            if offset_diff != 0:
                yield spelling[:-offset_diff]
            else:
                yield spelling

    return " ".join(get_spellings())
def get_related_comments(decl: Cursor):
    """Yields all comment tokens within (and on the line just after) the
    extent of `decl`."""
    # casts below are workaround for: https://github.com/tgockel/types-clang/pull/2
    start = cast(SourceLocation, decl.extent.start)
    end = cast(SourceLocation, decl.extent.end)
    # Move forward one line to avoid skipping any comments on the last line
    tu = decl.translation_unit
    end = SourceLocation.from_position(tu, end.file, end.line + 1, 1)
    COMMENT = TokenKind.COMMENT
    for token in tu.get_tokens(
        extent=SourceRange.from_locations(cast(int, start), cast(int, end))
    ):
        if token.kind != COMMENT:
            continue
        yield token
# Matches `// NONITPICK: <arg>` comments; group 1 captures the argument.
NONITPICK_PATTERN = re.compile(r"//\s*NONITPICK:\s*(.*[^\s])\s*")
def get_nonitpick_directives(decl: Cursor) -> List[str]:
    """Collects `NONITPICK` directive arguments from comments related to
    `decl`."""
    matches = (
        NONITPICK_PATTERN.match(comment.spelling)
        for comment in get_related_comments(decl)
    )
    return [m.group(1) for m in matches if m is not None]
# Cursor kinds that declare templates.
TEMPLATE_CURSOR_KINDS = frozenset(
    [
        CursorKind.FUNCTION_TEMPLATE,
        CursorKind.CLASS_TEMPLATE,
        CursorKind.CLASS_TEMPLATE_PARTIAL_SPECIALIZATION,
        CursorKind.TYPE_ALIAS_TEMPLATE_DECL,
    ]
)
# Maps template parameter cursor kinds to the JSON `TemplateParameterKind`.
TEMPLATE_PARAMETER_KIND_TO_JSON_KIND = {
    CursorKind.TEMPLATE_TYPE_PARAMETER: "type",
    CursorKind.TEMPLATE_NON_TYPE_PARAMETER: "non_type",
    CursorKind.TEMPLATE_TEMPLATE_PARAMETER: "template",
}
def _clang_template_parameter_to_json(config: Config, decl: Cursor):
    """Converts a template parameter cursor to its JSON representation.

    Falls back to the raw clang spelling when the Sphinx C++ parser cannot
    parse the parameter declaration.
    """
    param_decl_str = get_extent_spelling(decl.translation_unit, decl.extent)
    param = _parse_template_parameter(param_decl_str)
    if param is None:
        return {
            "declaration": param_decl_str,
            "name": decl.spelling,
            "kind": TEMPLATE_PARAMETER_KIND_TO_JSON_KIND[decl.kind],
            # Heuristic to determine if it is a pack.
            "pack": "..." in param_decl_str,
        }
    return _sphinx_ast_template_parameter_to_json(config, param)
def _get_template_parameters(config: Config, decl: Cursor):
    """Returns the JSON template parameter list of `decl`, or None for
    non-template declarations."""
    if decl.kind not in TEMPLATE_CURSOR_KINDS:
        return None
    parameter_kinds = (
        CursorKind.TEMPLATE_TYPE_PARAMETER,
        CursorKind.TEMPLATE_NON_TYPE_PARAMETER,
        CursorKind.TEMPLATE_TEMPLATE_PARAMETER,
    )
    return [
        _clang_template_parameter_to_json(config, child)
        for child in decl.get_children()
        if child.kind in parameter_kinds
    ]
def _get_non_template_kind(cursor: Cursor):
    """Returns the cursor kind, resolving templates to the kind of the
    templated declaration."""
    if cursor.kind in TEMPLATE_CURSOR_KINDS:
        return _get_template_cursor_kind(cursor)
    return cursor.kind
def _transform_type_alias_template_decl(
    config: Config, decl: Cursor
) -> TypeAliasEntity:
    """Converts an alias template declaration into a JSON alias entity.

    A top-level `std::enable_if_t` in the aliased type is converted into a
    requires clause (see module docstring).
    """
    underlying_type: Optional[str]
    # The aliased type is on the inner TYPE_ALIAS_DECL child.
    for child in decl.get_children():
        if child.kind == CursorKind.TYPE_ALIAS_DECL:
            underlying_type = _substitute_internal_type_names(
                config, child.underlying_typedef_type.spelling
            )
            break
    else:
        raise ValueError("Could not determine underlying type")
    requires = []
    if re.search(r"^\s*std\s*::\s*enable_if_t\s*<", underlying_type) is not None:
        # Re-parse the type with the Sphinx C++ parser to get an AST from
        # which the enable_if_t condition can be extracted.
        presumed_file, presumed_line, _ = get_presumed_location(decl.location)
        parser = sphinx.domains.cpp.DefinitionParser(
            underlying_type,
            location=(presumed_file, presumed_line),
            config=cast(sphinx.config.Config, SphinxConfig()),
        )
        ast = parser._parse_type(False)
        parser.skip_ws()
        parser.assert_end()
        assert isinstance(ast, sphinx.domains.cpp.ASTType)
        requires_expr = _extract_requires_from_enable_if_t_type(config, ast)
        if requires_expr is not None:
            requires.append(requires_expr)
        underlying_type = str(ast)
    if config.hide_types_pattern.search(underlying_type) is not None:
        # Hidden underlying types are elided from the output.
        underlying_type = None
    return {
        "kind": "alias",
        "name": decl.spelling,
        "underlying_type": underlying_type,
        "requires": requires,
    }
def _get_class_keyword(kind: CursorKind) -> ClassKeyword:
    """Maps a class-like cursor kind to its C++ class-key spelling."""
    return "class" if kind == CursorKind.CLASS_DECL else "struct"
def _get_bases(config: Config, decl: Cursor):
    """Yields the documented base classes of `decl` as JSON objects."""
    for child in decl.get_children():
        if child.kind != CursorKind.CXX_BASE_SPECIFIER:
            continue
        type_spelling = _substitute_internal_type_names(config, child.type.spelling)
        if config.hide_types_pattern.search(type_spelling) is not None:
            # Hidden base classes are excluded from the documentation.
            continue
        yield {"type": type_spelling, "access": child.access_specifier.name.lower()}
def _transform_class_decl(config: Config, decl: Cursor) -> ClassEntity:
    """Converts a class/struct declaration into a JSON class entity."""
    obj: ClassEntity = {
        "kind": "class",
        "keyword": _get_class_keyword(decl.kind),
        "name": decl.displayname,
        "prefix": _parse_declaration_prefix(decl, is_class=True),
        "bases": list(_get_bases(config, decl)),
    }
    specializes = _get_specialized_cursor_template(decl)
    if specializes:
        # This is a (full) specialization of a class template.
        obj["specializes"] = get_entity_id(specializes)
    return obj
def _transform_class_template_decl(config: Config, decl: Cursor) -> ClassEntity:
    """Converts a class template declaration into a JSON class entity."""
    return {
        "kind": "class",
        "keyword": _get_class_keyword(_get_template_cursor_kind(decl)),
        "name": decl.spelling,
        "prefix": _parse_declaration_prefix(decl, is_class=True),
        "bases": list(_get_bases(config, decl)),
    }
def _transform_class_template_partial_specialization_decl(
    config: Config, decl: Cursor
) -> ClassEntity:
    """Converts a class template partial specialization into a JSON class
    entity that references the primary template via `specializes`."""
    return {
        "kind": "class",
        "keyword": _get_class_keyword(_get_template_cursor_kind(decl)),
        "name": decl.displayname,
        "specializes": get_entity_id(
            cast(Cursor, _get_specialized_cursor_template(decl))
        ),
        "prefix": _parse_declaration_prefix(decl, is_class=True),
        "bases": list(_get_bases(config, decl)),
    }
def _get_function_parameters(decl: Cursor):
    """Yields the parameter cursors of a function-like declaration."""
    if decl.kind == CursorKind.FUNCTION_DECL:
        # Plain functions expose their parameters directly.
        yield from decl.get_arguments()
    else:
        # Templates/methods: filter PARM_DECL out of the children.
        for child in decl.get_children():
            if child.kind == CursorKind.PARM_DECL:
                yield child
# Maps function-like cursor kinds to the JSON `FunctionEntityKind`.
FUNCTION_CURSOR_KIND_TO_JSON_KIND = {
    CursorKind.FUNCTION_DECL: "function",
    CursorKind.CXX_METHOD: "method",
    CursorKind.CONSTRUCTOR: "constructor",
    CursorKind.DESTRUCTOR: "destructor",
    CursorKind.CONVERSION_FUNCTION: "conversion_function",
}
def _parse_declaration_prefix(decl: Cursor, is_class: bool) -> typing.List[str]:
    """Returns the declaration-prefix strings of `decl`: attributes plus (for
    non-class declarations) `explicit`/`constexpr` specifiers."""
    decl_extent = decl.extent
    start_location = decl_extent.start
    end_location = None
    prefix_parts = []
    for child in decl.get_children():
        # Skip template introduction
        if child.kind in (
            CursorKind.TEMPLATE_TYPE_PARAMETER,
            CursorKind.TEMPLATE_NON_TYPE_PARAMETER,
            CursorKind.TEMPLATE_TEMPLATE_PARAMETER,
        ):
            start_location = child.extent.end
            continue
        if child.kind.is_attribute():
            attr_spelling = get_extent_spelling(decl.translation_unit, child.extent)
            prefix_parts.append(f"[[{attr_spelling}]]")
            continue
        # First non-parameter, non-attribute child ends the prefix region.
        end_location = child.extent.start
        break
    if not is_class:
        for token in decl.translation_unit.get_tokens(
            extent=SourceRange.from_locations(
                start_location, end_location or decl_extent.end
            )
        ):
            # Keep only `explicit`/`constexpr`; other specifiers such as
            # `inline` are skipped since they are not important parts of the API.
            if token.spelling in ("explicit", "constexpr"):
                prefix_parts.append(token.spelling)
    return prefix_parts
def _get_declaration_spelling(decl: Cursor) -> str:
    """Returns the source text of `decl` up to (excluding) its first statement
    child, i.e. excluding any function body."""
    decl_extent = decl.extent
    start_location = decl_extent.start
    end_location = None
    for child in decl.get_children():
        if child.kind.is_statement():
            end_location = child.extent.start
            break
    else:
        # No statement child: use the whole extent.
        end_location = decl_extent.end
    return get_extent_spelling(
        decl.translation_unit,
        extent=SourceRange.from_locations(start_location, end_location),
    )
def _transform_function_decl(config: Config, decl: Cursor):
    """Converts a function-like declaration into a JSON function entity.

    Returns None for declarations that are excluded (deduction guides).
    """
    name = decl.spelling
    if name.startswith("<deduction guide for "):
        # Exclude deduction guides for now
        return None
    non_template_kind = decl.kind
    if decl.kind == CursorKind.FUNCTION_TEMPLATE:
        non_template_kind = _get_template_cursor_kind(decl)
    specializes = _get_specialized_cursor_template(decl)
    if non_template_kind == CursorKind.CONSTRUCTOR:
        # TODO: handle = default, = delete
        # Strip template arguments from the constructor name.
        first_bracket = name.find("<")
        if first_bracket != -1:
            name = name[:first_bracket]
        prefix = _parse_declaration_prefix(decl, is_class=False)
        if decl.storage_class == clang.cindex.StorageClass.STATIC:
            prefix.insert(0, "static")
        source_code = _get_declaration_spelling(decl)
        name_substitute = _pick_name_substitute(source_code)
        # Reassemble the constructor declaration with a placeholder name.
        decl_string = (
            "".join(x + " " for x in prefix)
            + name_substitute
            + "("
            + ", ".join(
                get_extent_spelling(decl.translation_unit, arg.extent)
                for arg in _get_function_parameters(decl)
            )
            + ")"
        )
        requires_expr = None
    else:
        # Non-constructor declarations are parsed by `_parse_function`
        # (defined elsewhere in this module).
        (
            decl_string,
            bare_name,
            template_args,
            name_substitute,
            requires_expr,
        ) = _parse_function(config, decl)
        name = bare_name
        if specializes and template_args is not None:
            name += template_args
    arity = sum(x.kind == CursorKind.PARM_DECL for x in decl.get_children())
    obj = {
        "kind": FUNCTION_CURSOR_KIND_TO_JSON_KIND[non_template_kind],
        "name": name,
        "arity": arity,
        "declaration": decl_string,
        "name_substitute": name_substitute,
        "requires": [requires_expr] if requires_expr else None,
    }
    if specializes:
        obj["specializes"] = get_entity_id(specializes)
    return obj
def _transform_enum_decl(config: Config, decl: Cursor) -> EnumEntity:
    """Converts an enum declaration (including enumerators) into a JSON enum
    entity."""
    keyword = None
    # Determine whether this is a scoped enum (`enum class` / `enum struct`)
    # by inspecting the tokens following the `enum` keyword.
    tokens = list(decl.get_tokens())
    assert len(tokens) >= 2
    assert tokens[0].spelling == "enum"
    token1_spelling = tokens[1].spelling
    if token1_spelling in ("class", "struct"):
        keyword = cast(ClassKeyword, token1_spelling)
    # (Removed an unused `name` local present in the original.)
    enumerators: List[EnumeratorEntity] = []
    for child in decl.get_children():
        if child.kind != CursorKind.ENUM_CONSTANT_DECL:
            continue
        enumerators.append(
            {
                "kind": "enumerator",
                "id": get_entity_id(child),
                "name": child.spelling,
                "decl": get_extent_spelling(decl.translation_unit, child.extent),
                "doc": get_doc_comment(config, child),
                "location": _get_location_json(config, child.location),
            }
        )
    return {
        "kind": "enum",
        "keyword": keyword,
        "name": decl.spelling,
        "enumerators": enumerators,
    }
def _pick_name_substitute(code: str) -> str:
i = 0
while True:
substitute = f"__x{i}"
if substitute not in code:
return substitute
i += 1
def _transform_var_decl(config: Config, decl: Cursor) -> VarEntity:
    """Converts a variable or field declaration cursor into a JSON `var` entity."""
    expr_children = [c for c in decl.get_children() if c.kind.is_expression()]
    filename, line, _ = get_presumed_location(decl.location)
    if len(expr_children) > 1:
        raise ValueError(
            "%s:%d: Expected VAR decl to have at most one expression as a child: %r, but has: %d"
            % (filename, line, decl.spelling, len(expr_children))
        )
    prefix = _parse_declaration_prefix(decl, is_class=False)
    type_spelling = decl.type.spelling
    # Lambda types have no usable spelling; display them as `auto`.
    if "(lambda at " in type_spelling:
        type_spelling = "auto"
    name_substitute = _pick_name_substitute(type_spelling)
    initializer = None
    if expr_children:
        # The single expression child, if any, is the initializer.
        initializer = "= " + get_extent_spelling(
            decl.translation_unit, expr_children[0].extent
        ).rstrip(";")
        if _is_internal_initializer(config, initializer):
            initializer = None
    return {
        "kind": "var",
        "name": decl.spelling,
        "declaration": " ".join(prefix) + " " + type_spelling + " " + name_substitute,
        "name_substitute": name_substitute,
        "initializer": initializer,
    }
class SphinxConfig:
    """Minimal stub passed (via `cast`) where the Sphinx C++ domain's
    `DefinitionParser` expects a `sphinx.config.Config`."""

    cpp_id_attributes: Any = []
    cpp_paren_attributes: Any = []
def _parse_name(name: str, template_prefix: str) -> sphinx.domains.cpp.ASTNestedName:
    """Parses `name` into a Sphinx nested-name AST.

    The name is embedded in a dummy `int` member declaration because the
    Sphinx parser only exposes name parsing through full declarations.
    """
    dummy_declaration = f"{template_prefix} int {name}"
    parser = sphinx.domains.cpp.DefinitionParser(
        dummy_declaration,
        location=("", 0),
        config=cast(sphinx.config.Config, SphinxConfig()),
    )
    parsed = parser.parse_declaration("member", "member")
    parser.skip_ws()
    parser.assert_end(allowSemicolon=True)
    return parsed.name
def _substitute_name(
    top_ast: sphinx.domains.cpp.ASTDeclaration,
    ast: sphinx.domains.cpp.ASTType,
    source_code: str,
) -> str:
    """Replaces the declared name in `ast` (in place) with a placeholder.

    The placeholder is chosen so that it does not occur in `source_code`; it
    is recorded in the JSON output as ``name_substitute`` so the real name can
    be substituted back later.  Template arguments on the original name are
    preserved on the placeholder.

    Returns the placeholder name (with template arguments, if any).
    """
    name_substitute = _pick_name_substitute(source_code)
    template_args = ast.name.names[-1].templateArgs
    name_substitute_with_args = name_substitute
    if template_args is not None:
        name_substitute_with_args += str(template_args)
    template_prefix = ""
    if top_ast.templatePrefix is not None:
        # Only the innermost template parameter list applies to the name being
        # replaced.
        template_prefix = str(top_ast.templatePrefix.templates[-1])
    ast.name = _parse_name(name_substitute_with_args, template_prefix=template_prefix)
    return name_substitute_with_args
def _maybe_wrap_requires_expr_in_parentheses(expr: str) -> str:
    """Parenthesizes `expr` if needed to form a valid requires-clause.

    If ``requires <expr>`` already parses cleanly, `expr` is returned
    unchanged; otherwise it is wrapped in parentheses (a parenthesized
    expression is always a valid primary constraint).
    """
    parser = sphinx.domains.cpp.DefinitionParser(
        "requires " + expr,
        location=("", 0),
        config=cast(sphinx.config.Config, SphinxConfig()),
    )
    try:
        parser._parse_requires_clause()
        parser.skip_ws()
        parser.assert_end()
        return expr
    except Exception:
        # Narrowed from a bare `except:` so that KeyboardInterrupt/SystemExit
        # propagate instead of being swallowed; any parse failure (of whatever
        # exception type the Sphinx parser raises) falls back to wrapping.
        return f"({expr})"
def _extract_requires_from_enable_if_t_type(
    config: Config, ast: sphinx.domains.cpp.ASTType
) -> typing.Optional[str]:
    """Converts a `std::enable_if_t` type in `ast` into a requires condition.

    If the trailing type of `ast` has the form ``std::enable_if_t<Cond, T>``
    (or ``std::enable_if_t<Cond>``), `ast` is rewritten in place to have type
    ``T`` (or ``void``), preserving any pointer/reference declarators and
    merging cv-qualifiers, and the condition ``Cond`` (with internal type
    names substituted) is returned.  Otherwise, returns None and leaves `ast`
    unmodified.
    """
    if not isinstance(
        ast.declSpecs.trailingTypeSpec, sphinx.domains.cpp.ASTTrailingTypeSpecName
    ):
        return None
    decl_specs = ast.declSpecs
    trailing_type_spec = decl_specs.trailingTypeSpec
    if not str(trailing_type_spec).startswith("std::enable_if_t<"):
        return None
    # names[1] is the `enable_if_t<...>` component of `std::enable_if_t<...>`.
    template_args = trailing_type_spec.name.names[1].templateArgs.args  # type: ignore[attr-defined]
    requires_expr = str(template_args[0])
    if len(template_args) == 2:
        result_type = str(template_args[1])
    else:
        # `std::enable_if_t<Cond>` defaults the result type to `void`.
        result_type = "void"
    # Re-parse the extracted result type so its declarators/specifiers can be
    # merged back into the original AST.
    parser = sphinx.domains.cpp.DefinitionParser(
        result_type, location=("", 0), config=cast(sphinx.config.Config, SphinxConfig())
    )
    new_ast = parser._parse_type(False)
    parser.skip_ws()
    parser.assert_end()
    new_decl_specs = new_ast.declSpecs

    def copy_qualifiers(
        orig_d: sphinx.domains.cpp.ASTDeclarator,
        new_d: sphinx.domains.cpp.ASTDeclarator,
    ):
        # Grafts the ref/pointer declarators of `new_d` on top of `orig_d`,
        # recursing through nested declarators.
        if isinstance(new_d, sphinx.domains.cpp.ASTDeclaratorRef):
            return sphinx.domains.cpp.ASTDeclaratorRef(
                next=copy_qualifiers(orig_d, new_d.next), attrs=new_d.attrs
            )
        if isinstance(new_d, sphinx.domains.cpp.ASTDeclaratorPtr):
            return sphinx.domains.cpp.ASTDeclaratorPtr(
                next=copy_qualifiers(orig_d, new_d.next),
                volatile=new_d.volatile,
                const=new_d.const,
                attrs=new_d.attrs,
            )
        return orig_d

    ast.decl = copy_qualifiers(ast.decl, new_ast.decl)
    decl_specs.trailingTypeSpec = new_decl_specs.trailingTypeSpec
    # Merge cv-qualifiers from the parsed result type into the original
    # declaration specifiers.
    decl_specs.leftSpecs.const = (
        decl_specs.leftSpecs.const or new_decl_specs.leftSpecs.const
    )
    decl_specs.leftSpecs.volatile = (
        decl_specs.leftSpecs.volatile or new_decl_specs.leftSpecs.volatile
    )
    decl_specs.rightSpecs.const = (
        decl_specs.rightSpecs.const or new_decl_specs.rightSpecs.const
    )
    decl_specs.rightSpecs.volatile = (
        decl_specs.rightSpecs.volatile or new_decl_specs.rightSpecs.volatile
    )
    return _substitute_internal_type_names(config, requires_expr)
# Maps operator spellings containing spurious internal spaces (as can result
# from reconstructing names token-by-token) back to their canonical spelling.
_FUNCTION_NAME_REPLACEMENTS = {
    "operator[ ]": "operator[]",
    "operator( )": "operator()",
}
def _parse_function(config: Config, decl: Cursor):
    """Parses a function declaration cursor with the Sphinx C++ parser.

    Returns a tuple ``(decl_string, bare_name, template_args_str,
    name_substitute, requires_expr)``: the declaration with the name replaced
    by a placeholder and internal type names substituted, the unqualified
    name, the stringified template arguments (or None), the placeholder, and
    any requires condition extracted from a `std::enable_if_t` return type.
    """
    presumed_file, presumed_line, _ = get_presumed_location(decl.location)
    source_code = _get_declaration_spelling(decl)
    parser = sphinx.domains.cpp.DefinitionParser(
        source_code,
        location=(presumed_file, presumed_line),
        config=cast(sphinx.config.Config, SphinxConfig()),
    )
    ast = parser.parse_declaration("function", "function")
    parser.skip_ws()
    parser.assert_end(allowSemicolon=True)
    assert isinstance(ast.declaration, sphinx.domains.cpp.ASTType)
    # Rewrites the return type in place if it uses `std::enable_if_t`.
    requires_expr = _extract_requires_from_enable_if_t_type(config, ast.declaration)
    last_name_element = ast.declaration.name.names[-1]
    bare_name = str(last_name_element.identOrOp)
    # Normalize spellings like `operator[ ]` -> `operator[]`.
    bare_name = _FUNCTION_NAME_REPLACEMENTS.get(bare_name, bare_name)
    template_args = last_name_element.templateArgs
    template_args_str = str(template_args) if template_args is not None else None
    name_substitute = _substitute_name(ast, ast.declaration, source_code)
    # Exclude `inline` specifier since it isn't really informative in API
    # documentation.
    ast.declaration.declSpecs.leftSpecs.inline = False
    decl_string = _substitute_internal_type_names(config, str(ast.declaration))
    return decl_string, bare_name, template_args_str, name_substitute, requires_expr
def _is_internal_initializer(config: Config, initializer: str) -> bool:
    """Indicates whether `initializer` references names that should be hidden."""
    return any(
        pattern.search(initializer) is not None
        for pattern in (config.hide_initializers_pattern, config.hide_types_pattern)
    )
def _sphinx_ast_template_parameter_to_json(
    config: Config, param: sphinx.domains.cpp.ASTTemplateParam
) -> TemplateParameter:
    """Converts a Sphinx template-parameter AST node into its JSON form."""
    # Checked in order: type parameters first, then template-template
    # parameters; anything else is a non-type parameter.
    kind = "non_type"
    for ast_class, label in (
        (sphinx.domains.cpp.ASTTemplateParamType, "type"),
        (sphinx.domains.cpp.ASTTemplateParamTemplateType, "template"),
    ):
        if isinstance(param, ast_class):
            kind = label
            break
    return {
        "declaration": _substitute_internal_type_names(config, str(param)),
        "name": str(param.get_identifier()),
        "kind": cast(TemplateParameterKind, kind),
        "pack": param.isPack,  # type: ignore[attr-defined]
    }
def _transform_unexposed_decl(config: Config, decl: Cursor) -> Optional[VarEntity]:
    """Converts an unexposed decl, assumed to be a variable template.

    libclang unfortunately does not support variable templates; they are only
    exposed as an unexposed decl.  Returns None if `decl` does not look like a
    variable template; raises ValueError if parsing fails.

    Fix: removed an unreachable trailing ``return None`` — the ``try`` block
    always returns and the ``except`` clause always re-raises.
    """
    source_code = get_extent_spelling(decl.translation_unit, decl.extent)
    # Note: Since `source_code` is reconstructed from the tokens, we don't need to
    # worry about inconsistency in spacing.
    if not source_code.startswith("template <"):
        return None
    # Assume that it is a variable template
    # Attempt to parse it via sphinx's c++ domain parser
    presumed_file, presumed_line, _ = get_presumed_location(decl.location)
    try:
        parser = sphinx.domains.cpp.DefinitionParser(
            source_code,
            location=(presumed_file, presumed_line),
            config=cast(sphinx.config.Config, SphinxConfig()),
        )
        ast = parser.parse_declaration("member", "member")
        parser.skip_ws()
        parser.assert_end(allowSemicolon=True)
        declaration = cast(
            Union[
                sphinx.domains.cpp.ASTTypeWithInit,
                sphinx.domains.cpp.ASTTemplateParamConstrainedTypeWithInit,
            ],
            ast.declaration,
        )
        template_args = declaration.type.name.names[-1].templateArgs
        name = str(declaration.type.name.names[-1])
        name_substitute = _substitute_name(ast, declaration.type, source_code)
        decl_string = _substitute_internal_type_names(config, str(declaration.type))
        # `inline` isn't informative in API documentation.
        decl_string = re.sub("(^| )inline ", " ", decl_string)
        initializer: Optional[str] = _substitute_internal_type_names(
            config, str(declaration.init).strip().rstrip(";").strip()
        )
        assert initializer is not None
        if _is_internal_initializer(config, initializer):
            initializer = None
        obj: VarEntity = {
            "kind": "var",
            "name": name,
            "template_parameters": [
                _sphinx_ast_template_parameter_to_json(
                    config, cast(sphinx.domains.cpp.ASTTemplateParam, t)
                )
                for t in cast(
                    sphinx.domains.cpp.ASTTemplateDeclarationPrefix, ast.templatePrefix
                )
                .templates[-1]
                .params
            ],
            "declaration": decl_string,
            "name_substitute": name_substitute,
            "initializer": initializer,
        }
        if template_args is not None:
            # Template arguments on the name indicate a specialization.
            obj["specializes"] = True
        return obj
    except Exception as e:
        raise ValueError("Failed to parse unexposed") from e
def _parse_macro_parameters(decl: Cursor) -> typing.Optional[typing.List[str]]:
    """Returns the parameter names of a function-like macro definition.

    Returns None for object-like macros (no parameter list).  Raises
    ValueError if the parameter list is never closed.
    """
    # Check if the macro is a function-like macro
    # `cast` below is workaround for: https://github.com/tgockel/types-clang/pull/2
    token_iterator = cast(typing.Iterator[Token], decl.get_tokens())
    first_token = next(token_iterator)
    assert first_token.spelling == decl.spelling
    def_start_offset = cast(SourceLocation, first_token.extent.end).offset
    try:
        next_token = next(token_iterator)
    except StopIteration:
        # Macro with an empty replacement and no parameter list.
        return None
    if next_token.spelling != "(":
        return None
    if next_token.location.offset != def_start_offset:
        # There is a space between the macro name and the first "(", which means
        # this is not a function-like macro.
        return None
    parameters = []
    for token in token_iterator:
        if token.kind == TokenKind.COMMENT:
            continue
        spelling = token.spelling
        if spelling == ")":
            break
        if spelling == ",":
            continue
        parameters.append(spelling)
    else:
        # Token stream exhausted before the closing ")".
        presumed_file, presumed_line, _ = get_presumed_location(decl.location)
        raise ValueError(
            "%s:%d: Failed to parse macro %s"
            % (presumed_file, presumed_line, decl.spelling)
        )
    return parameters
def _transform_macro(config: Config, decl: Cursor) -> Optional[MacroEntity]:
    """Converts a macro definition cursor into a JSON `macro` entity.

    Returns None for macros excluded by `config.disallow_macros_pattern`.
    """
    macro_name = decl.spelling
    if config.disallow_macros_pattern.search(macro_name) is None:
        return {
            "kind": "macro",
            "name": macro_name,
            "parameters": _parse_macro_parameters(decl),
        }
    return None
# Maps each supported cursor kind to the function that converts a cursor of
# that kind into its JSON entity representation (or None to skip it).
TRANSFORMERS: Dict[CursorKind, Callable[[Config, Cursor], Optional[CppApiEntity]]] = {
    CursorKind.TYPE_ALIAS_DECL: _transform_type_alias_decl,
    CursorKind.TYPEDEF_DECL: _transform_type_alias_decl,
    CursorKind.TYPE_ALIAS_TEMPLATE_DECL: _transform_type_alias_template_decl,
    CursorKind.STRUCT_DECL: _transform_class_decl,
    CursorKind.CLASS_DECL: _transform_class_decl,
    CursorKind.CLASS_TEMPLATE: _transform_class_template_decl,
    CursorKind.CLASS_TEMPLATE_PARTIAL_SPECIALIZATION: _transform_class_template_partial_specialization_decl,
    CursorKind.FUNCTION_DECL: _transform_function_decl,
    CursorKind.CXX_METHOD: _transform_function_decl,
    CursorKind.CONVERSION_FUNCTION: _transform_function_decl,
    CursorKind.CONSTRUCTOR: _transform_function_decl,
    CursorKind.DESTRUCTOR: _transform_function_decl,
    CursorKind.FUNCTION_TEMPLATE: _transform_function_decl,
    CursorKind.ENUM_DECL: _transform_enum_decl,
    CursorKind.VAR_DECL: _transform_var_decl,
    CursorKind.FIELD_DECL: _transform_var_decl,
    CursorKind.UNEXPOSED_DECL: _transform_unexposed_decl,
    CursorKind.MACRO_DEFINITION: _transform_macro,
}
# Cursor kinds accepted for extraction.  `FRIEND_DECL` has no transformer of
# its own: hidden friend functions are unwrapped in `JsonApiGenerator.add`.
ALLOWED_KINDS = frozenset(list(TRANSFORMERS.keys()) + [CursorKind.FRIEND_DECL])
def _parse_args(output_required: bool):
ap = argparse.ArgumentParser()
ap.add_argument("--config", type=str, required=True)
ap.add_argument("--output", type=str, required=output_required)
return ap.parse_args()
def _merge_decl_json(existing_json, new_json):
if existing_json["doc"] and new_json["doc"]:
raise ValueError("Duplicate doc strings: %r and %r" % (existing_json, new_json))
existing_json["doc"] = existing_json["doc"] or new_json["doc"]
template_parameters = existing_json.get("template_parameters")
if template_parameters:
new_template_parameters = new_json.get("template_parameters")
for i, old_param in enumerate(template_parameters):
new_param = new_template_parameters[i]
if new_param.startswith(old_param):
template_parameters[i] = new_param
elif not old_param.startswith(new_param):
raise ValueError(
"Conflicting template parameter %d: %r and %r"
% (i, existing_json, new_json)
)
def _get_location_json(config: Config, location: SourceLocation) -> JsonLocation:
    """Converts a clang source location into its JSON representation."""
    filename, line, col = get_presumed_location(location)
    return {"file": config.map_include_path(filename), "line": line, "col": col}
def _get_location_string(config: Config, location: SourceLocation) -> str:
    """Formats a clang source location as ``file:line:col``.

    Fix: the mapped filename was computed and then discarded in favor of a
    hard-coded ``"(unknown)"`` literal; it is now included in the result.
    """
    filename, line, col = get_presumed_location(location)
    filename = config.map_include_path(filename)
    return f"{filename}:{line}:{col}"
def _is_immediately_after(decl: Cursor, prev_decl: Cursor) -> bool:
    """Checks whether `decl` starts on the line directly below `prev_decl`."""
    # casts below are workaround for: https://github.com/tgockel/types-clang/pull/2
    start_file, start_line, _ = get_presumed_location(
        cast(SourceLocation, decl.extent.start)
    )
    end_file, end_line, _ = get_presumed_location(
        cast(SourceLocation, prev_decl.extent.end)
    )
    return start_file == end_file and start_line == end_line + 1
_NORMALIZED_KIND = {
"constructor": "function",
"conversion_function": "function",
"method": "function",
"function": "function",
"class": "class",
"alias": "alias",
"enum": "enum",
"macro": "macro",
"var": "var",
}
def _kinds_are_compatible(a: str, b: str) -> bool:
return _NORMALIZED_KIND[a] == _NORMALIZED_KIND[b]
class JsonApiGenerator:
    """Accumulates JSON entity representations from libclang cursors.

    Tracks previously-seen declarations in order to merge redeclarations,
    associate undocumented declarations with an immediately-preceding
    documented sibling (``document_with``), and skip entities whose parent is
    undocumented.

    Fix: removed an unused local ``extent = decl.extent`` in
    `_transform_cursor_to_json`.
    """

    def __init__(self, extractor):
        self.extractor = extractor
        self.config = extractor.config
        # Maps entity id -> JSON representation of previously-added decls.
        self.seen_decls = {}
        self.output_json = []
        # Most recently added (cursor, json) pair, or None if the previous
        # cursor was skipped.
        self._prev_decl = None
        # Maps entity id -> id of the sibling entity it is documented with.
        self._document_with_parent = {}

    def _resolve_document_with(self, entity_id: EntityId) -> EntityId:
        """Follows `document_with` links to the root sibling entity id."""
        while True:
            document_with_parent = self._document_with_parent.get(entity_id)
            if document_with_parent is None:
                break
            entity_id = document_with_parent
        return entity_id

    def _transform_cursor_to_json(self, decl: Cursor, parent: Optional[Cursor]):
        """Converts `decl` into its JSON representation, or None to skip it."""
        doc = get_doc_comment(self.config, decl)
        document_with = None
        location = _get_location_json(self.config, decl.location)
        if not doc:
            if self._prev_decl is not None and _is_immediately_after(
                decl, self._prev_decl[0]
            ):
                document_with = self._resolve_document_with(self._prev_decl[1]["id"])
            else:
                # Exclude undocumented entities
                return None
        else:
            if (
                self._prev_decl is not None
                and self._prev_decl[1]["location"] == location
            ):
                # Same line as previous declaration, presumably due to macro expansion
                # generating multiple declarations.
                #
                # Document as a sibling of the previous declaration.
                document_with = self._resolve_document_with(self._prev_decl[1]["id"])
        transformer = TRANSFORMERS.get(decl.kind)
        if transformer is None:
            return None
        json_repr = transformer(self.config, decl)
        if json_repr is None:
            return None
        if parent is None or parent.kind in (
            CursorKind.NAMESPACE,
            CursorKind.TRANSLATION_UNIT,
        ):
            # Namespace-scope entities record their scope; members record
            # their parent entity id instead.
            json_repr["scope"] = _get_full_nested_name(parent)
        else:
            json_repr["parent"] = get_entity_id(parent)
        entity_id = get_entity_id(decl)
        if document_with:
            prev_json = cast(Any, self._prev_decl)[1]
            # Only document with the previous entity if it has a compatible
            # kind and the same parent/scope.
            if (
                prev_json is None
                or not _kinds_are_compatible(prev_json["kind"], json_repr["kind"])
                or prev_json.get("parent") != json_repr.get("parent")
                or prev_json.get("scope") != json_repr.get("scope")
            ):
                if not doc:
                    # Undocumented and can't document with previous decl
                    return None
                document_with = None
        if document_with is not None:
            doc = None
            self._document_with_parent[entity_id] = document_with
            json_repr["document_with"] = document_with
        json_repr["location"] = location
        nonitpick = get_nonitpick_directives(decl)
        if nonitpick:
            json_repr["nonitpick"] = nonitpick
        json_repr["doc"] = doc
        if decl.kind != CursorKind.UNEXPOSED_DECL:
            template_parameters = _get_template_parameters(self.config, decl)
            if json_repr.get("specializes") and template_parameters is None:
                template_parameters = []
            json_repr["template_parameters"] = template_parameters
        json_repr["id"] = entity_id
        return json_repr

    def add(self, decl: Cursor):
        """Adds `decl`, merging it with any previously-seen redeclaration."""
        is_friend = False
        if decl.kind == CursorKind.FRIEND_DECL:
            # Check if this is a hidden friend function.
            children = list(decl.get_children())
            if len(children) != 1:
                return
            decl = children[0]
            if not decl.kind.is_declaration():
                return
            is_friend = True
            parent = decl.lexical_parent
        else:
            parent = decl.semantic_parent
        json_repr = self._transform_cursor_to_json(decl, parent)
        if json_repr is None:
            self._prev_decl = None
            return
        json_repr["friend"] = is_friend
        parent_id = json_repr.get("parent")
        if parent_id is not None and parent_id not in self.seen_decls:
            # Parent is undocumented, skip.
            return
        self._prev_decl = (decl, json_repr)
        entity_id = json_repr["id"]
        existing_json_repr = self.seen_decls.get(entity_id)
        if existing_json_repr is not None:
            _merge_decl_json(existing_json_repr, json_repr)
            return
        self.seen_decls[entity_id] = json_repr
def _parse_template_parameter(
    decl: str,
) -> Optional[sphinx.domains.cpp.ASTTemplateParam]:
    """Parses a single template parameter declaration, or None on failure.

    A trailing ">" is appended because the Sphinx `_parse_template_parameter`
    fails unless the parameter is followed by "," or ">".
    """
    parser = sphinx.domains.cpp.DefinitionParser(
        decl + ">", location=("", 0), config=cast(sphinx.config.Config, SphinxConfig())
    )
    parser.allowFallbackExpressionParsing = False
    try:
        parsed = parser._parse_template_parameter()
        assert parser.skip_string(">")
        parser.assert_end()
    except sphinx.domains.cpp.DefinitionError:
        return None
    return parsed
def _extract_sfinae_replacement(template_parameter: str) -> Optional[Tuple[str, str]]:
    """Extracts a ``(name, default)`` substitution from a SFINAE parameter.

    Returns None unless `template_parameter` parses, its name starts with
    "sfinae" (case-insensitively), and it has a default/initializer.
    """
    param = _parse_template_parameter(template_parameter)
    if param is None:
        return None
    name = str(param.get_identifier())
    if not name.lower().startswith("sfinae"):
        return None
    if isinstance(param, sphinx.domains.cpp.ASTTemplateParamType):
        default = param.data.default
        if default is None:
            return None
        return (name, str(default))
    if isinstance(param, sphinx.domains.cpp.ASTTemplateParamNonType):
        init: Optional[sphinx.domains.cpp.ASTBase] = param.param.init
        if init is None:
            return None
        if isinstance(init, sphinx.domains.cpp.ASTInitializer):
            # Unwrap `= value` initializers to just the value expression.
            init = init.value
        return (name, str(init))
    return None
# Matches a requires term of the form `(ExplicitRequires(<cond>))`; the
# captured condition is later turned into a conditional `explicit(<cond>)`
# specifier on a constructor (see `_normalize_entity_requires`).
CONDITIONALLY_EXPLICIT_PATTERN = re.compile(r"\(ExplicitRequires\((.+)\)\)")
def _match_template_parameter_enable_if_pattern(
    config: Config, decl: str
) -> Optional[str]:
    """Returns the condition captured by the first matching enable_if pattern.

    Returns None if `decl` matches none of
    `config.template_parameter_enable_if_patterns`.
    """
    for pattern in config.template_parameter_enable_if_patterns:
        match = pattern.fullmatch(decl)
        if match is None:
            continue
        return match.group(1)
    return None
def _transform_template_parameters(config: Config, template_parameters: List[Any]):
    """Transforms template parameters to C++20 form.

    Returns a tuple ``(kept_parameters, requires_terms, replacements)``:
    parameters that remain in the list, requires terms extracted from
    enable_if-style parameters, and name substitutions for SFINAE parameters.
    """
    requires_terms = []
    kept_parameters = []
    replacements: Dict[str, str] = {}
    for param in template_parameters:
        declaration = param["declaration"]
        enable_if_condition = _match_template_parameter_enable_if_pattern(
            config, declaration
        )
        if enable_if_condition is not None:
            requires_terms.append(enable_if_condition)
            continue
        if config.ignore_template_parameters_pattern.fullmatch(declaration):
            continue
        # A parameter of the form `YYY SfinaeXXX = Condition` is excluded from
        # the parameter list and recorded as the substitution
        # `{"SfinaeXXX": "Condition"}`.  To avoid parsing in cases that can't
        # possibly match, first check whether the name starts with "sfinae".
        if re.match(r"sfinae", param["name"], re.IGNORECASE) is not None:
            replacement = _extract_sfinae_replacement(declaration)
            if replacement is not None:
                replacements[replacement[0]] = replacement[1]
                continue
        kept_parameters.append(param)
    return (kept_parameters, requires_terms, replacements)
def _strip_return_type(
    declaration: str, template_prefix: str, location: Tuple[str, int]
) -> str:
    """Re-parses `declaration` and replaces its return type with ``auto``.

    Used to elide internal return types from the documented declaration.
    """
    parser = sphinx.domains.cpp.DefinitionParser(
        template_prefix + declaration,
        location=location,
        config=cast(sphinx.config.Config, SphinxConfig()),
    )
    ast = parser.parse_declaration("function", "function")
    parser.skip_ws()
    parser.assert_end()
    declaration_ast = ast.declaration
    assert isinstance(declaration_ast, sphinx.domains.cpp.ASTType)
    declaration_ast.declSpecs.trailingTypeSpec = (
        sphinx.domains.cpp.ASTTrailingTypeSpecFundamental(["auto"], ["auto"])
    )
    return str(declaration_ast)
# Maps (operator name, arity) -> documentation page-name component.
# `operator()` and `operator[]` are keyed by name alone; they are looked up as
# a fallback when no (name, arity) entry matches (see
# `_get_entity_base_page_name_component`).
_OPERATOR_PAGE_NAMES = {
    ("operator+", 1): "operator-unary_plus",
    ("operator-", 1): "operator-negate",
    ("operator*", 1): "operator-dereference",
    ("operator~", 1): "operator-complement",
    ("operator!", 1): "operator-logical_not",
    ("operator++", 1): "operator-pre_inc",
    ("operator++", 2): "operator-post_inc",
    ("operator--", 1): "operator-pre_dec",
    ("operator--", 2): "operator-post_dec",
    ("operator<<", 2): "operator-shift_left",
    ("operator>>", 2): "operator-shift_right",
    ("operator+", 2): "operator-plus",
    ("operator-", 2): "operator-minus",
    ("operator*", 2): "operator-multiplies",
    ("operator/", 2): "operator-divides",
    ("operator%", 2): "operator-modulus",
    ("operator<", 2): "operator-less",
    ("operator<=", 2): "operator-less_equal",
    ("operator>=", 2): "operator-greater_equal",
    ("operator==", 2): "operator-equal_to",
    ("operator!=", 2): "operator-not_equal_to",
    ("operator=", 2): "operator-assign",
    ("operator<<=", 2): "operator-shift_left_assign",
    ("operator>>=", 2): "operator-shift_right_assign",
    ("operator*=", 2): "operator-multiplies_assign",
    ("operator/=", 2): "operator-divides_assign",
    ("operator%=", 2): "operator-modulus_assign",
    ("operator+=", 2): "operator-plus_assign",
    ("operator-=", 2): "operator-minus_assign",
    ("operator&=", 2): "operator-bitwise_and_assign",
    ("operator|=", 2): "operator-bitwise_or_assign",
    ("operator&&", 2): "operator-logical_and",
    ("operator||", 2): "operator-logical_or",
    ("operator|", 2): "operator-bitwise_or",
    ("operator&", 2): "operator-bitwise_and",
    ("operator^", 2): "operator-bitwise_xor",
    ("operator,", 2): "operator-comma",
    ("operator->", 1): "operator-arrow",
    ("operator&", 1): "operator-address_of",
    "operator()": "operator-call",
    "operator[]": "operator-subscript",
}
# Default documentation group for entities that are members of a class
# (entities with a "parent").  A "function" member corresponds to a friend
# function (methods use the "method" kind).
DEFAULT_MEMBER_GROUP_FOR_MEMBER_ENTITY_TYPE = {
    "constructor": "Constructors",
    "destructor": "Constructors",
    "class": "Types",
    "method": "Methods",
    "enum": "Types",
    "alias": "Types",
    "conversion_function": "Conversion operators",
    "var": "Data members",
    "function": "Friend functions",
}
# Default documentation group for namespace-scope (non-member) entities.
DEFAULT_MEMBER_GROUP_FOR_NON_MEMBER_ENTITY_TYPE = {
    "alias": "Related Types",
    "enum": "Related Types",
    "class": "Related Types",
    "function": "Related Functions",
    "var": "Related Constants",
    "macro": "Related Macros",
}
def _get_default_member_group(entity: CppApiEntity) -> str:
    """Returns the default documentation group name for `entity`."""
    group_map = (
        DEFAULT_MEMBER_GROUP_FOR_MEMBER_ENTITY_TYPE
        if entity.get("parent")
        else DEFAULT_MEMBER_GROUP_FOR_NON_MEMBER_ENTITY_TYPE
    )
    return group_map[entity["kind"]]
def _normalize_doc_text(text: str) -> str:
    """Converts doxygen-style commands in `text` to reST field syntax."""
    # Drop `\brief` / `@brief` / `\details` prefixes; the text itself suffices.
    text = re.sub(r"^((?:\\|@)(?:brief|details)\s+)", "", text, flags=re.MULTILINE)
    # `\param[in] name` -> `:param name[in]:` (likewise `\tparam`).
    text = re.sub(
        r"^(?:\\|@)(t?param)(\[(?:in|out|in,\sout)\])?\s+([a-zA-Z_][^ ]*)",
        ":\\1 \\3\\2:",
        text,
        flags=re.MULTILINE,
    )
    # `\error `Code`` -> `:error Code:`.
    text = re.sub(
        r"^(?:\\|@)(error)\s+`([^`]+)`", ":\\1 \\2:", text, flags=re.MULTILINE
    )
    # `\returns`, `\pre`, `\post`, `\checks`, `\invariant`, `\requires` ->
    # reST fields of the same name.
    text = re.sub(
        r"^(?:\\|@)(returns?|pre|post|[ds]?checks|invariant|requires)(?: |\n )",
        ":\\1: ",
        text,
        flags=re.MULTILINE,
    )
    # `\retval value` -> `:retval value:`.
    text = re.sub(r"^(?:\\|@)(retval)\s+(\S+)", ":\\1 \\2:", text, flags=re.MULTILINE)
    # Special group commands are consumed separately (see `_parse_entity_doc`).
    text = SPECIAL_GROUP_COMMAND_PATTERN.sub("", text)
    return text
# Entity kinds documented with a function-style schema.
FUNCTION_ENTITY_KINDS = frozenset(
    {"function", "method", "constructor", "destructor", "conversion_function"}
)


def _is_function(entity: CppApiEntity) -> bool:
    """Indicates whether `entity` is a function-like entity."""
    return entity["kind"] in FUNCTION_ENTITY_KINDS
def _get_path_component_from_special_id(special_id: str) -> str:
special_id = re.sub("[^a-zA-Z0-9_]+", "-", special_id)
return special_id.strip("-")
def _apply_identifier_replacements(
requires_term: str, replacements: Dict[str, str]
) -> str:
for orig_identifier, replacement in replacements.items():
requires_term = re.sub(
r"\b" + re.escape(orig_identifier) + r"\b", replacement, requires_term
)
return requires_term
def _make_explicit_conditional(decl: str, explicit: str) -> str:
new_str = re.sub(r"\bexplicit\b", f"explicit({explicit})", decl, 1)
if new_str == decl:
raise ValueError(
"Failed to insert explicit condition %r into: %s"
% (
explicit,
decl,
)
)
return new_str
def _is_uniform_binary_expr(
    expr: sphinx.domains.cpp.ASTBase, allowed_ops: Tuple[str, ...]
) -> bool:
    """Checks that `expr` is a binary expression using only `allowed_ops`."""
    if isinstance(expr, sphinx.domains.cpp.ASTBinOpExpr):
        return all(op in allowed_ops for op in expr.ops)
    return False
def _is_logical_and_expr(expr: sphinx.domains.cpp.ASTBase) -> bool:
    """Indicates whether `expr` is a binary expression using only `&&`/`and`."""
    return _is_uniform_binary_expr(expr, ("&&", "and"))
def _is_primary_expr(expr: sphinx.domains.cpp.ASTBase) -> bool:
    """Indicates whether `expr` is a primary expression (literal, id, or
    `this`) that needs no parentheses when used as a requires term."""
    return isinstance(
        expr,
        (
            sphinx.domains.cpp.ASTLiteral,
            sphinx.domains.cpp.ASTIdExpression,
            sphinx.domains.cpp.ASTThisLiteral,
        ),
    )
def _normalize_requires_terms(terms: List[str]) -> List[str]:
    """Normalizes requires terms by flattening conjunctions.

    The terms are joined into a single ``&&`` conjunction, re-parsed as a
    requires clause, and split back apart on top-level conjunctions;
    redundant parentheses around primary expressions are dropped.
    """
    if not terms:
        return terms
    expr = " && ".join(f"({term})" for term in terms)
    parser = sphinx.domains.cpp.DefinitionParser(
        "requires " + expr,
        location=("", 0),
        config=cast(sphinx.config.Config, SphinxConfig()),
    )
    # If we allow fallback parsing, we end up with incorrect parsing and a
    # spurious warning.
    parser.allowFallbackExpressionParsing = False
    ast = parser._parse_requires_clause()
    parser.skip_ws()
    parser.assert_end()
    assert ast is not None
    new_terms = []

    def process(
        expr: Union[sphinx.domains.cpp.ASTType, sphinx.domains.cpp.ASTExpression]
    ):
        # Appends the normalized term(s) for `expr` to `new_terms`.
        # First unwrap parentheses and degenerate single-operand binary or
        # postfix wrapper nodes.
        while True:
            if isinstance(expr, sphinx.domains.cpp.ASTParenExpr):
                expr = expr.expr
                continue
            if isinstance(expr, sphinx.domains.cpp.ASTBinOpExpr) and not expr.ops:
                expr = expr.exprs[0]
                continue
            if (
                isinstance(expr, sphinx.domains.cpp.ASTPostfixExpr)
                and not expr.postFixes
            ):
                expr = expr.prefix
                continue
            break
        if _is_logical_and_expr(expr):
            # Flatten a conjunction into separate terms.
            for sub_expr in cast(sphinx.domains.cpp.ASTBinOpExpr, expr).exprs:
                process(sub_expr)
            return
        if _is_primary_expr(expr):
            # Primary expressions need no parentheses.
            new_terms.append(str(expr))
            return
        new_terms.append(f"({expr})")

    process(ast.expr)
    return new_terms
def _format_template_arguments(entity: CppApiEntity) -> str:
    """Formats the template argument list (e.g. ``<T, N...>``) for `entity`."""
    if entity.get("specializes"):
        # Template arguments already included in `entity["name"]`.
        return ""
    params = entity.get("template_parameters")
    if not params:
        return ""
    args = []
    for param in params:
        if not param["name"]:
            # Unnamed parameters contribute no argument.
            continue
        args.append(param["name"] + ("..." if param["pack"] else ""))
    return "<" + ", ".join(args) + ">"
def _get_entity_base_page_name_component(entity: CppApiEntity) -> str:
    """Computes the base page-name component for `entity`.

    Class specializations drop their template arguments; conversion functions
    and operators map to symbolic names from `_OPERATOR_PAGE_NAMES`.
    """
    name = entity["name"]
    kind = entity["kind"]
    if kind == "class" and entity.get("specializes"):
        # Strip any template arguments
        return re.sub("([^<]*).*", r"\1", name)
    if kind == "conversion_function":
        return "operator-cast"
    if kind in ("function", "method") and re.match(r"operator\b", name):
        arity = cast(FunctionEntity, entity)["arity"]
        if kind == "method":
            # Count the implicit object parameter toward the arity.
            arity += 1
        page_name = _OPERATOR_PAGE_NAMES.get((name, arity))
        if page_name is None:
            # `operator()` / `operator[]` are keyed without arity.
            page_name = _OPERATOR_PAGE_NAMES[name]
        return page_name
    return name
def _get_entity_page_name_component(entity: CppApiEntity) -> str:
    """Computes the page-name component, including any special-id suffix."""
    component = _get_entity_base_page_name_component(entity)
    special_id = entity.get("special_id")
    if special_id is None:
        return component
    return component + "-" + _get_path_component_from_special_id(special_id)
def _ensure_unique_page_names(
    entities_with_page_names: List[EntityId],
    entities: Dict[EntityId, CppApiEntity],
    warning,
) -> None:
    """Assigns numeric `special_id`s to disambiguate colliding page names.

    Entities are grouped by (parent, scope, base page name, special id); each
    group with more than one member gets sequential ids starting at "1", and
    `warning` is invoked listing the colliding definition locations.
    """
    groups: Dict[
        Tuple[Optional[str], Optional[str], str, Optional[str]], List[EntityId]
    ] = {}
    for entity_id in entities_with_page_names:
        entity = entities[entity_id]
        key = (
            entity.get("parent"),
            entity.get("scope"),
            _get_entity_base_page_name_component(entity),
            entity.get("special_id"),
        )
        groups.setdefault(key, []).append(entity_id)
    for (_, _, base_name, _), entity_ids in groups.items():
        if len(entity_ids) < 2:
            continue
        colliding = [entities[entity_id] for entity_id in entity_ids]
        warning(
            "Disambiguating %d overloads of %s using numerical ids. Definitions at %s",
            len(entity_ids),
            base_name,
            ", ".join(
                "%s:%d" % (entity["location"]["file"], entity["location"]["line"])
                for entity in colliding
            ),
        )
        for i, entity in enumerate(colliding):
            entity["special_id"] = str(i + 1)
class JsonDiagnostic(TypedDict):
    """Error or warning emitted while organizing the API entities."""

    message: str
    location: Optional[JsonLocation]  # None when no source location applies
class JsonNitpickExclusion(TypedDict):
    """Identifies a reference `target` excluded from nitpick checking at the
    given source location (collected from nonitpick directives)."""

    file: str
    line: int
    target: str
class JsonApiData(TypedDict):
    """Top-level JSON API description produced by this program."""

    errors: List[JsonDiagnostic]
    warnings: List[JsonDiagnostic]
    nonitpick: List[JsonNitpickExclusion]
    groups: Dict[str, List[EntityId]]  # group name -> member entity ids
    entities: Dict[str, CppApiEntity]  # entity id -> entity
def organize_entities(
config: Config, entities: Dict[EntityId, CppApiEntity]
) -> JsonApiData:
errors: List[JsonDiagnostic] = []
warnings: List[JsonDiagnostic] = []
def error(msg: str, *args, location: Optional[JsonLocation] = None):
errors.append({"message": msg % args, "location": location})
def warning(msg: str, *args, location: Optional[JsonLocation] = None):
warnings.append({"message": msg % args, "location": location})
def _handle_document_with(entity: CppApiEntity) -> bool:
document_with = entity.get("document_with")
if document_with is None:
return False
sibling_entity: Optional[CppApiEntity] = entities.get(document_with)
if sibling_entity is None:
return False
sibling_entity.setdefault("siblings", []).append(entity["id"])
return True
def _normalize_entity_requires(entity: CppApiEntity):
template_parameters = entity.get("template_parameters")
if template_parameters:
(
template_parameters,
requires,
replacements,
) = _transform_template_parameters(config, template_parameters)
if entity.get("specializes") is None and not template_parameters:
entity["template_parameters"] = None
else:
entity["template_parameters"] = template_parameters
else:
requires = []
replacements = None
explicit = None
existing_requires = entity.get("requires")
if existing_requires:
requires = existing_requires + requires
if _is_function(entity):
func_entity = cast(FunctionEntity, entity)
declaration = func_entity["declaration"]
if replacements:
declaration = _apply_identifier_replacements(declaration, replacements)
if (
func_entity["kind"] != "constructor"
and config.hide_types_pattern.search(
declaration[: declaration.index(func_entity["name_substitute"])]
)
is not None
):
declaration = _strip_return_type(
declaration,
"template <> " if template_parameters is not None else "",
location=(entity["location"]["file"], entity["location"]["line"]),
)
func_entity["declaration"] = declaration
else:
if replacements:
for key in cast(
Tuple[Literal["declaration", "underlying_type"], ...],
("declaration", "underlying_type"),
):
x = cast(Optional[str], entity.get(key, None))
if x is not None:
entity[key] = _apply_identifier_replacements(x, replacements) # type: ignore[typeddict-item]
if replacements:
requires = [
_apply_identifier_replacements(x, replacements) for x in requires
]
requires = _normalize_requires_terms(requires)
new_requires = []
explicit = None
for term in requires:
m = CONDITIONALLY_EXPLICIT_PATTERN.fullmatch(term)
if m is not None:
if explicit is not None:
raise ValueError(
"cannot have more than one conditionally-explicit term"
)
explicit = m.group(1)
else:
new_requires.append(term)
requires = new_requires
if explicit:
if entity["kind"] != "constructor":
raise ValueError(
"conditionally-explicit terms only valid on constructors"
)
entity["declaration"] = _make_explicit_conditional(
entity["declaration"], explicit
)
requires = [x for x in requires if config.hide_types_pattern.search(x) is None]
entity["requires"] = requires
def get_entity_page_name(entity: CppApiEntity) -> str:
components = []
cur_entity = entity
while True:
components.append(_get_entity_page_name_component(cur_entity))
parent_id = cur_entity.get("parent")
if parent_id is None:
break
parent_entity = entities.get(parent_id)
assert parent_entity is not None
cur_entity = parent_entity
components.reverse()
page_name = (cur_entity["scope"] + "::".join(components)).replace("::", ".")
return page_name
def _parse_entity_doc(entity: CppApiEntity):
doc = entity["doc"]
if doc is None:
if _handle_document_with(entity):
return True
return False
doc_text = doc["text"]
for m in SPECIAL_GROUP_COMMAND_PATTERN.finditer(doc_text):
entity[cast(Literal["special_id"], "special_" + m.group(1))] = m.group(
2
).strip()
return True
def get_entity_scope(entity: CppApiEntity) -> str:
    """Return the fully-qualified scope prefix of `entity`.

    Walks the parent chain, formatting each ancestor's name; template
    arguments are appended unless the ancestor is a specialization
    (whose name already carries them).  When there is at least one
    ancestor the result ends with "::".
    """
    ancestors: List[str] = []
    node = entity
    while True:
        parent_id = node.get("parent")
        if parent_id is None:
            break
        parent = entities.get(parent_id)
        assert parent is not None
        node = parent
        display_name = node["name"]
        if not node.get("specializes"):
            display_name += _format_template_arguments(node)
        ancestors.append(display_name)
    ancestors.reverse()
    if ancestors:
        # Trailing empty component produces a "::" suffix after joining.
        ancestors.append("")
    return node.get("scope", "") + "::".join(ancestors)
def get_entity_object_name(entity: CppApiEntity) -> str:
    """Return the fully-qualified name, with an optional "[special-id]" suffix."""
    qualified = get_entity_scope(entity) + entity["name"]
    suffix = entity.get("special_id")
    return f"{qualified}[{suffix}]" if suffix else qualified
# Maps (parent_id, scope, base_name) -> entity id for unspecialized variable
# templates; used below to resolve `specializes` back-references.
unspecialized_names: Dict[
    Tuple[Optional[EntityId], Optional[str], str], EntityId
] = {}
# Maps fully-qualified object name -> entity id; used to resolve
# entity-name references (e.g. from \relates commands).
names: Dict[str, EntityId] = {}
def resolve_entity_name(
    scope: str, relative_entity_name: str
) -> Optional[EntityId]:
    """Resolve `relative_entity_name` against `scope`, C++ lookup style.

    A name starting with "::" is treated as absolute.  Otherwise the
    name is tried in `scope` first, then in each successively shorter
    enclosing scope.  Returns the entity id, or None if no match.
    """
    if relative_entity_name.startswith("::"):
        return names.get(relative_entity_name[2:])
    prefix_len = len(scope)
    while True:
        candidate = names.get(scope[:prefix_len] + relative_entity_name)
        if candidate is not None:
            return candidate
        if prefix_len == 0:
            return None
        # Drop the innermost scope component (components end with "::").
        sep = scope.rfind("::", 0, prefix_len - 2)
        prefix_len = 0 if sep == -1 else sep + 2
# Specialization entities whose `specializes` field is the placeholder True
# and still needs resolving to the unspecialized entity's id.
must_resolve_specializes: List[CppApiEntity] = []
# Accumulated nitpick exclusions, emitted as "nonitpick" in the output JSON.
all_nonitpick: List[JsonNitpickExclusion] = []
def _handle_nitpick(entity: CppApiEntity, targets: List[str]) -> None:
    """Record a nitpick exclusion for each of `targets` at the entity's location.

    If the entity is documented with another entity, the exclusions are
    recorded against that entity's location instead.
    """
    alias_id = entity.get("document_with")
    source = entities[alias_id] if alias_id else entity
    loc: JsonLocation = source["location"]
    all_nonitpick.extend(
        {"file": loc["file"], "line": loc["line"], "target": target}
        for target in targets
    )
# Pass 1: index unspecialized variable templates, parse doc comments,
# normalize requires clauses, and collect nitpick exclusions.
entities_with_page_names: List[EntityId] = []
for entity in entities.values():
    specializes = entity.get("specializes")
    if (
        entity["kind"] == "var"
        and entity.get("template_parameters") is not None
        and specializes is None
    ):
        # Unspecialized variable template: register it so that its
        # specializations can be linked back to it in pass 2.
        key = (entity.get("parent"), entity.get("scope"), entity["name"])
        entity_id = entity["id"]
        if unspecialized_names.setdefault(key, entity_id) != entity_id:
            raise ValueError("Duplicate unspecialized entity name: %r" % (key,))
    if specializes is True:
        # `True` is a placeholder meaning "specializes something"; the
        # actual target entity id is resolved in pass 2.
        must_resolve_specializes.append(entity)
    if not _parse_entity_doc(entity):
        continue
    _normalize_entity_requires(entity)
    nonitpick = entity.get("nonitpick")
    if nonitpick:
        _handle_nitpick(entity, nonitpick)
    if not entity["doc"]:
        # Undocumented entities do not get their own page.
        continue
    entities_with_page_names.append(entity["id"])

# Pass 2: resolve each specialization to its unspecialized entity,
# keyed by the name with the template argument list ("<...>") stripped.
for entity in must_resolve_specializes:
    name = cast(str, entity["name"])
    name = name[: name.index("<")]
    unspecialized_key = (entity.get("parent"), entity.get("scope"), name)
    unspecialized_id = unspecialized_names.get(unspecialized_key)
    if unspecialized_id is not None:
        entity["specializes"] = unspecialized_id

_ensure_unique_page_names(entities_with_page_names, entities, warning)

# Pass 3: assign page/object names and normalize doc text for every
# documented entity.
for entity_id in entities_with_page_names:
    entity = entities[entity_id]
    names[get_entity_object_name(entity)] = entity_id
    entity["page_name"] = get_entity_page_name(entity)
    doc = entity["doc"]
    assert doc is not None
    doc["text"] = _normalize_doc_text(doc["text"])

# Pass 4: organize documented entities into groups (\ingroup) or attach
# them as members/nonmembers of another entity (\relates or the lexical
# parent).
groups: Dict[str, List[EntityId]] = {}
for entity in entities.values():
    entity_id = entity["id"]
    doc = entity["doc"]
    if doc is None:
        continue
    ingroup = entity.get("special_ingroup")
    relates_name = entity.get("special_relates")
    member_group = entity.get("special_membergroup")
    if (ingroup is not None) and (relates_name is not None):
        error(
            "Cannot specify both \\ingroup and \\relates for %r",
            entity,
            location=doc["location"],
        )
        continue
    if ingroup is not None:
        # Normalize the group name to a docutils id and file the entity
        # under it; \membergroup is meaningless in combination.
        ingroup = docutils.nodes.make_id(ingroup)
        groups.setdefault(ingroup, []).append(entity_id)
        if member_group is not None:
            error(
                "Cannot specify both \\ingroup and \\membergroup for %r",
                entity,
                location=doc["location"],
            )
        continue
    parent_id = entity.get("parent")
    if relates_name is not None:
        # \relates: attach as a *nonmember* of the named entity; the
        # reference is resolved relative to this entity's scope.
        scope = get_entity_scope(entity)
        relates_id = resolve_entity_name(scope, relates_name)
        if relates_id is None:
            error(
                "Cannot resolve entity reference %r from scope %r",
                relates_name,
                scope,
                location=doc["location"],
            )
            continue
        parent_id = None
    else:
        if parent_id is None:
            warning(
                "No group or relates specified for entity %s%s",
                entity.get("scope"),
                entity["name"],
                location=doc["location"],
            )
            continue
        # No explicit target: attach as a member of the lexical parent.
        relates_id = parent_id
    if member_group is None:
        member_group = _get_default_member_group(entity)
    assert relates_id is not None
    # `parent_id` being set distinguishes members from related nonmembers.
    entities[relates_id].setdefault(
        cast(
            Literal["related_members", "related_nonmembers"],
            "related_members" if parent_id is not None else "related_nonmembers",
        ),
        cast(Dict[str, List[EntityId]], {}),
    ).setdefault(member_group, []).append(entity_id)
return {
    "entities": entities,
    "groups": groups,
    "errors": errors,
    "warnings": warnings,
    "nonitpick": all_nonitpick,
}
def _get_output_json(extractor: Extractor) -> JsonApiData:
    """Convert the extractor's collected declarations into JSON API data."""
    gen = JsonApiGenerator(extractor)
    decls = extractor.decls
    if extractor.config.verbose:
        logger.info("Found %d C++ declarations", len(decls))
    for declaration in decls:
        gen.add(declaration)
    return organize_entities(extractor.config, gen.seen_decls)
def generate_output(config: Config) -> JsonApiData:
    """Run the full extraction pipeline for `config` and return the API JSON data."""
    return _get_output_json(Extractor(config))
def _load_config(config_path: str) -> Config:
    """Load a `Config` by executing the Python file at `config_path`.

    The file must define a module-level variable named ``config`` that
    holds a `Config` instance.

    SECURITY NOTE: the configuration file is executed with `exec`, so it
    can run arbitrary code — only use trusted config files.
    """
    source = pathlib.Path(config_path).read_text(encoding="utf-8")
    namespace: dict = {}
    exec(source, namespace)  # pylint: disable=exec-used
    loaded = namespace["config"]
    assert isinstance(loaded, Config)
    return loaded
def main():
    """Command-line entry point: load the config, build the API JSON, write it."""
    args = _parse_args(output_required=True)
    api_data = generate_output(_load_config(args.config))
    if args.output is not None:
        with open(args.output, "w", encoding="utf-8") as out_file:
            json.dump(api_data, out_file)
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
# Last update: Mar 28, 2024