Pular para conteúdo

API Python

A interface principal do projeto é a CLI.

A API Python abaixo documenta os módulos internos mais úteis para reuso e inspeção.

Cliente OAI-PMH

ojs_scrape.oaipmh

Cliente OAI-PMH para periódicos OJS.

OAIPMHError

Bases: Exception

Erro retornado pelo servidor OAI-PMH.

Source code in src/ojs_scrape/oaipmh.py
class OAIPMHError(Exception):
    """Error returned by the OAI-PMH server.

    Attributes:
        code: Error code passed to the constructor.
        message: Error text passed to the constructor.
    """

    def __init__(self, code: str, message: str):
        formatted = f"OAI-PMH error [{code}]: {message}"
        super().__init__(formatted)
        self.code, self.message = code, message

OAIPMHParseError

Bases: Exception

Erro ao interpretar XML retornado por um endpoint OAI-PMH.

Source code in src/ojs_scrape/oaipmh.py
class OAIPMHParseError(Exception):
    """Error while interpreting the XML returned by an OAI-PMH endpoint."""

OAIPMHClient

Cliente para o protocolo OAI-PMH.

Source code in src/ojs_scrape/oaipmh.py
class OAIPMHClient:
    """Client for the OAI-PMH protocol.

    All requests share one ``requests.Session`` and are spaced at least
    ``delay`` seconds apart. Instances work as context managers: leaving the
    ``with`` block closes the session.
    """

    def __init__(self, base_url: str, delay: float = 1.0, timeout: float = 30.0):
        """Create an OAI-PMH client.

        Args:
            base_url: Base URL of the OJS journal, or the direct URL of its
                ``/oai`` endpoint. Trailing slashes and a trailing ``/index``
                segment are stripped.
            delay: Minimum interval, in seconds, between requests to the same
                server. Negative values are treated as ``0.0``.
            timeout: Network timeout in seconds.
        """
        trimmed = base_url.rstrip("/").removesuffix("/index")
        points_at_endpoint = trimmed.endswith("/oai")
        self.base_url = trimmed.removesuffix("/oai") if points_at_endpoint else trimmed
        self.oai_url = trimmed if points_at_endpoint else f"{self.base_url}/oai"

        self.delay = max(delay, 0.0)
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers["User-Agent"] = USER_AGENT
        # Cached result of `identify()`; filled on first use.
        self._journal: OJSJournal | None = None
        # `time.monotonic()` of the latest request attempt; 0.0 means "none yet".
        self._last_request_at = 0.0

    def __enter__(self) -> OAIPMHClient:
        return self

    def __exit__(self, *_exc_info: object) -> None:
        self.close()

    def close(self) -> None:
        """Close the internal HTTP session."""
        self.session.close()

    def _request(self, params: Mapping[str, str]) -> ET.Element:
        """Perform one OAI-PMH request and return the XML root element.

        Args:
            params: Query parameters sent to the OAI endpoint.

        Returns:
            The parsed root element of the response.

        Raises:
            OAIPMHError: If the response contains an ``<error>`` element.
            requests.RequestException: On network failures or HTTP error
                statuses (via ``raise_for_status``).
        """
        self._wait_rate_limit()
        try:
            response = self.session.get(self.oai_url, params=params, timeout=self.timeout)
            response.raise_for_status()
        finally:
            # Record the attempt even when it fails, so that a caller retrying
            # after an error is still throttled by `delay`.
            self._last_request_at = time.monotonic()

        root = _parse_xml_content(response.content, response.url)

        failure = root.find("oai:error", NS)
        if failure is None:
            return root
        raise OAIPMHError(failure.get("code", "unknown"), (failure.text or "").strip())

    def _wait_rate_limit(self) -> None:
        """Sleep until at least `delay` seconds have passed since the last request."""
        if self.delay <= 0 or self._last_request_at == 0:
            return
        remaining = self.delay - (time.monotonic() - self._last_request_at)
        if remaining > 0:
            time.sleep(remaining)

    def identify(self) -> OJSJournal:
        """Run `Identify` and return information about the repository.

        The result is cached: only the first call issues a request.
        """
        cached = self._journal
        if cached is not None:
            return cached

        root = self._request({"verb": "Identify"})
        node = root.find("oai:Identify", NS)

        self._journal = OJSJournal(
            base_url=self.base_url,
            oai_base_url=self.oai_url,
            repository_name=_text(node, "oai:repositoryName"),
            admin_email=_text(node, "oai:adminEmail"),
            earliest_datestamp=_text(node, "oai:earliestDatestamp"),
        )
        return self._journal

    def list_sets(self) -> list[OAISet]:
        """Run `ListSets` and return every available set.

        Some OAI-PMH repositories do not implement sets; when the server
        answers with the `noSetHierarchy` error an empty list is returned.
        Any other `OAIPMHError` is re-raised.
        """
        collected: list[OAISet] = []
        query = {"verb": "ListSets"}

        while True:
            try:
                page = self._request(query)
            except OAIPMHError as error:
                if error.code != "noSetHierarchy":
                    raise
                return []

            collected.extend(
                OAISet(spec=_text(node, "oai:setSpec"), name=_text(node, "oai:setName"))
                for node in page.findall(".//oai:set", NS)
            )

            token = _resumption_token(page)
            if token is None:
                return collected
            # Follow-up pages are requested with the verb and the token only.
            query = {"verb": "ListSets", "resumptionToken": token}

    def list_records(
        self,
        metadata_prefix: str = "oai_dc",
        from_date: str | None = None,
        until_date: str | None = None,
        set_spec: str | None = None,
    ) -> Iterator[Article]:
        """Run `ListRecords` and yield articles, following pagination.

        Records that `_parse_record` cannot convert (it returns `None`) are
        skipped.
        """
        query = _record_params("ListRecords", metadata_prefix, from_date, until_date, set_spec)

        while True:
            page = self._request(query)

            for node in page.findall(".//oai:record", NS):
                parsed = self._parse_record(node)
                if parsed is None:
                    continue
                yield parsed

            token = _resumption_token(page)
            if token is None:
                return
            # Follow-up pages are requested with the verb and the token only.
            query = {"verb": "ListRecords", "resumptionToken": token}

    def get_record(self, identifier: str, metadata_prefix: str = "oai_dc") -> Article | None:
        """Run `GetRecord` for one identifier.

        Returns:
            The parsed article, or `None` when the response has no `<record>`
            element or the record cannot be parsed.
        """
        query = {
            "verb": "GetRecord",
            "identifier": identifier,
            "metadataPrefix": metadata_prefix,
        }
        node = self._request(query).find(".//oai:record", NS)
        return None if node is None else self._parse_record(node)

    def list_identifiers(
        self,
        metadata_prefix: str = "oai_dc",
        from_date: str | None = None,
        until_date: str | None = None,
        set_spec: str | None = None,
    ) -> Iterator[tuple[str, bool]]:
        """Run `ListIdentifiers`, which is lighter than `ListRecords`.

        Yields:
            `(identifier, deleted)` pairs, where `deleted` is true when the
            header carries `status="deleted"`.
        """
        query = _record_params(
            "ListIdentifiers", metadata_prefix, from_date, until_date, set_spec
        )

        while True:
            page = self._request(query)
            for header in page.findall(".//oai:header", NS):
                yield _text(header, "oai:identifier"), header.get("status") == "deleted"

            token = _resumption_token(page)
            if token is None:
                return
            # Follow-up pages are requested with the verb and the token only.
            query = {"verb": "ListIdentifiers", "resumptionToken": token}

    def _parse_record(self, record: ET.Element) -> Article | None:
        """Convert a `<record>` element into an `Article`.

        Returns:
            `None` when the record has no header. When the record is deleted
            or has no `oai_dc:dc` payload, an `Article` holding only the
            header information. Otherwise a fully populated `Article`.
        """
        header = record.find("oai:header", NS)
        if header is None:
            return None

        identifier = _text(header, "oai:identifier")
        datestamp = _text(header, "oai:datestamp")
        deleted = header.get("status") == "deleted"
        article_id = _article_id_from_identifier(identifier)
        set_specs = [
            _clean_text(elem.text) for elem in header.findall("oai:setSpec", NS) if elem.text
        ]

        article = Article(
            oai_identifier=identifier,
            article_id=article_id,
            url=f"{self.base_url}/article/view/{article_id}" if article_id else "",
            datestamp=datestamp,
            deleted=deleted,
            set_spec=set_specs[0] if set_specs else "",
            set_specs=set_specs,
            # Fallback; replaced below when the record carries `dc:date` values.
            dates=[datestamp] if datestamp else [],
        )

        dc = record.find(".//oai_dc:dc", NS)
        if dc is None or deleted:
            return article

        article.title = _dc_text(dc, "dc:title")
        article.creators = _dc_texts(dc, "dc:creator")
        article.subjects = _dc_texts(dc, "dc:subject")
        article.descriptions = _dc_texts(dc, "dc:description")
        article.publishers = _dc_texts(dc, "dc:publisher")
        article.contributors = _dc_texts(dc, "dc:contributor")
        article.dates = _dc_texts(dc, "dc:date") or article.dates
        article.identifiers = _dc_texts(dc, "dc:identifier")
        article.types = _dc_texts(dc, "dc:type")
        article.formats = _dc_texts(dc, "dc:format")
        article.sources = _dc_texts(dc, "dc:source")
        article.languages = _dc_texts(dc, "dc:language")
        article.coverages = _dc_texts(dc, "dc:coverage")
        article.rights = _dc_texts(dc, "dc:rights")

        # Derived fields. The keyword list is a copy so that later edits to
        # one of the two lists do not silently change the other.
        article.palavras_chave = list(article.subjects)
        article.resumo = article.descriptions[0] if article.descriptions else ""
        article.doi = _extract_doi(article.identifiers)
        article.pages = _extract_pages([*article.coverages, *article.sources])

        return article

__init__

__init__(
    base_url: str, delay: float = 1.0, timeout: float = 30.0
)

Cria um cliente OAI-PMH.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| base_url | str | URL base do periódico OJS ou URL direta do endpoint `/oai`. | required |
| delay | float | Intervalo mínimo, em segundos, entre requisições ao mesmo servidor. | 1.0 |
| timeout | float | Timeout de rede em segundos. | 30.0 |

Source code in src/ojs_scrape/oaipmh.py
def __init__(self, base_url: str, delay: float = 1.0, timeout: float = 30.0):
    """Create an OAI-PMH client.

    Args:
        base_url: Base URL of the OJS journal, or the direct URL of its
            ``/oai`` endpoint. Trailing slashes and a trailing ``/index``
            segment are stripped.
        delay: Minimum interval, in seconds, between requests to the same
            server. Negative values are treated as ``0.0``.
        timeout: Network timeout in seconds.
    """
    trimmed = base_url.rstrip("/").removesuffix("/index")
    points_at_endpoint = trimmed.endswith("/oai")
    self.base_url = trimmed.removesuffix("/oai") if points_at_endpoint else trimmed
    self.oai_url = trimmed if points_at_endpoint else f"{self.base_url}/oai"

    self.delay = max(delay, 0.0)
    self.timeout = timeout
    self.session = requests.Session()
    self.session.headers["User-Agent"] = USER_AGENT
    # Cached result of `identify()`; filled on first use.
    self._journal: OJSJournal | None = None
    # 0.0 means that no request has been made yet.
    self._last_request_at = 0.0

close

close() -> None

Fecha a sessão HTTP interna.

Source code in src/ojs_scrape/oaipmh.py
def close(self) -> None:
    """Close the internal HTTP session."""
    self.session.close()

identify

identify() -> OJSJournal

Executa Identify e retorna informações do repositório.

Source code in src/ojs_scrape/oaipmh.py
def identify(self) -> OJSJournal:
    """Run `Identify` and return information about the repository.

    The result is cached: only the first call issues a request.
    """
    cached = self._journal
    if cached is not None:
        return cached

    root = self._request({"verb": "Identify"})
    node = root.find("oai:Identify", NS)

    self._journal = OJSJournal(
        base_url=self.base_url,
        oai_base_url=self.oai_url,
        repository_name=_text(node, "oai:repositoryName"),
        admin_email=_text(node, "oai:adminEmail"),
        earliest_datestamp=_text(node, "oai:earliestDatestamp"),
    )
    return self._journal

list_sets

list_sets() -> list[OAISet]

Executa ListSets e retorna todos os sets disponíveis.

Alguns repositórios OAI-PMH não implementam sets. Nesses casos retorna lista vazia.

Source code in src/ojs_scrape/oaipmh.py
def list_sets(self) -> list[OAISet]:
    """Run `ListSets` and return every available set.

    Some OAI-PMH repositories do not implement sets; when the server answers
    with the `noSetHierarchy` error an empty list is returned. Any other
    `OAIPMHError` is re-raised.
    """
    collected: list[OAISet] = []
    query = {"verb": "ListSets"}

    while True:
        try:
            page = self._request(query)
        except OAIPMHError as error:
            if error.code != "noSetHierarchy":
                raise
            return []

        collected.extend(
            OAISet(spec=_text(node, "oai:setSpec"), name=_text(node, "oai:setName"))
            for node in page.findall(".//oai:set", NS)
        )

        token = _resumption_token(page)
        if token is None:
            return collected
        # Follow-up pages are requested with the verb and the token only.
        query = {"verb": "ListSets", "resumptionToken": token}

list_records

list_records(
    metadata_prefix: str = "oai_dc",
    from_date: str | None = None,
    until_date: str | None = None,
    set_spec: str | None = None,
) -> Iterator[Article]

Executa ListRecords e gera artigos via paginação.

Source code in src/ojs_scrape/oaipmh.py
def list_records(
    self,
    metadata_prefix: str = "oai_dc",
    from_date: str | None = None,
    until_date: str | None = None,
    set_spec: str | None = None,
) -> Iterator[Article]:
    """Run `ListRecords` and yield articles, following pagination.

    Records that `_parse_record` cannot convert (it returns `None`) are
    skipped.
    """
    query = _record_params("ListRecords", metadata_prefix, from_date, until_date, set_spec)

    while True:
        page = self._request(query)

        for node in page.findall(".//oai:record", NS):
            parsed = self._parse_record(node)
            if parsed is None:
                continue
            yield parsed

        token = _resumption_token(page)
        if token is None:
            return
        # Follow-up pages are requested with the verb and the token only.
        query = {"verb": "ListRecords", "resumptionToken": token}

get_record

get_record(
    identifier: str, metadata_prefix: str = "oai_dc"
) -> Article | None

Executa GetRecord para um identificador específico.

Source code in src/ojs_scrape/oaipmh.py
def get_record(self, identifier: str, metadata_prefix: str = "oai_dc") -> Article | None:
    """Run `GetRecord` for one identifier.

    Returns:
        The parsed article, or `None` when the response has no `<record>`
        element or the record cannot be parsed.
    """
    query = {
        "verb": "GetRecord",
        "identifier": identifier,
        "metadataPrefix": metadata_prefix,
    }
    node = self._request(query).find(".//oai:record", NS)
    return None if node is None else self._parse_record(node)

list_identifiers

list_identifiers(
    metadata_prefix: str = "oai_dc",
    from_date: str | None = None,
    until_date: str | None = None,
    set_spec: str | None = None,
) -> Iterator[tuple[str, bool]]

Executa ListIdentifiers, mais leve que ListRecords.

Source code in src/ojs_scrape/oaipmh.py
def list_identifiers(
    self,
    metadata_prefix: str = "oai_dc",
    from_date: str | None = None,
    until_date: str | None = None,
    set_spec: str | None = None,
) -> Iterator[tuple[str, bool]]:
    """Run `ListIdentifiers`, which is lighter than `ListRecords`.

    Yields:
        `(identifier, deleted)` pairs, where `deleted` is true when the
        header carries `status="deleted"`.
    """
    query = _record_params(
        "ListIdentifiers", metadata_prefix, from_date, until_date, set_spec
    )

    while True:
        page = self._request(query)
        for header in page.findall(".//oai:header", NS):
            yield _text(header, "oai:identifier"), header.get("status") == "deleted"

        token = _resumption_token(page)
        if token is None:
            return
        # Follow-up pages are requested with the verb and the token only.
        query = {"verb": "ListIdentifiers", "resumptionToken": token}

Modelos

ojs_scrape.models

Modelos de dados para registros OAI-PMH e OJS.

OAISet dataclass

Representa um set OAI-PMH.

Source code in src/ojs_scrape/models.py
@dataclass(slots=True)
class OAISet:
    """Represents an OAI-PMH set."""

    # Set identifier; `OAIPMHClient.list_sets` fills it from `oai:setSpec`.
    spec: str = ""
    # Set display name; `OAIPMHClient.list_sets` fills it from `oai:setName`.
    name: str = ""

OJSJournal dataclass

Representa um periódico OJS exposto via OAI-PMH.

Source code in src/ojs_scrape/models.py
@dataclass(slots=True)
class OJSJournal:
    """Represents an OJS journal exposed through OAI-PMH."""

    # Journal base URL (the client's `base_url`).
    base_url: str = ""
    # Values read from the `Identify` response by `OAIPMHClient.identify`:
    # `oai:repositoryName`, `oai:adminEmail` and `oai:earliestDatestamp`.
    repository_name: str = ""
    admin_email: str = ""
    earliest_datestamp: str = ""
    # URL of the OAI endpoint (the client's `oai_url`).
    oai_base_url: str = ""
    # Starts empty; `OAIPMHClient.identify` does not populate it.
    sets: list[OAISet] = field(default_factory=list)

Article dataclass

Representa um artigo de periódico OJS.

Source code in src/ojs_scrape/models.py
@dataclass(slots=True)
class Article:
    """Representa um artigo de periódico OJS."""

    # Identificação
    oai_identifier: str = ""
    article_id: int = 0
    url: str = ""
    datestamp: str = ""

    # Metadados Dublin Core
    title: str = ""
    subtitle: str = ""
    creators: list[str] = field(default_factory=list)
    subjects: list[str] = field(default_factory=list)
    descriptions: list[str] = field(default_factory=list)
    publishers: list[str] = field(default_factory=list)
    contributors: list[str] = field(default_factory=list)
    dates: list[str] = field(default_factory=list)
    identifiers: list[str] = field(default_factory=list)  # DOI, URLs
    types: list[str] = field(default_factory=list)
    formats: list[str] = field(default_factory=list)
    sources: list[str] = field(default_factory=list)
    languages: list[str] = field(default_factory=list)
    coverages: list[str] = field(default_factory=list)  # páginas, recorte espacial etc.
    rights: list[str] = field(default_factory=list)

    # Campos derivados
    doi: str = ""
    pages: str = ""
    resumo: str = ""
    palavras_chave: list[str] = field(default_factory=list)

    # Contexto OJS
    set_spec: str = ""
    set_specs: list[str] = field(default_factory=list)
    set_name: str = ""
    issue_id: int = 0
    issue_number: str = ""
    section: str = ""
    pdf_url: str = ""

    # Status
    deleted: bool = False

    def to_dict(self) -> ArticleDict:
        """Converte o artigo em dicionário plano para serialização."""
        return {
            "oai_identifier": self.oai_identifier,
            "article_id": self.article_id,
            "url": self.url,
            "datestamp": self.datestamp,
            "title": self.title,
            "subtitle": self.subtitle,
            "creators": self.creators,
            "subjects": self.subjects,
            "descriptions": self.descriptions,
            "publishers": self.publishers,
            "contributors": self.contributors,
            "dates": self.dates,
            "identifiers": self.identifiers,
            "types": self.types,
            "formats": self.formats,
            "sources": self.sources,
            "languages": self.languages,
            "coverages": self.coverages,
            "rights": self.rights,
            "doi": self.doi,
            "pages": self.pages,
            "resumo": self.resumo,
            "palavras_chave": self.palavras_chave,
            "set_spec": self.set_spec,
            "set_specs": self.set_specs,
            "set_name": self.set_name,
            "issue_id": self.issue_id,
            "issue_number": self.issue_number,
            "section": self.section,
            "pdf_url": self.pdf_url,
            "deleted": self.deleted,
        }

to_dict

to_dict() -> ArticleDict

Converte o artigo em dicionário plano para serialização.

Source code in src/ojs_scrape/models.py
def to_dict(self) -> ArticleDict:
    """Convert the article into a flat dictionary for serialization.

    Keys follow the field declaration order. List values are the article's
    own list objects, not copies.
    """
    flat: ArticleDict = {
        # Identification
        "oai_identifier": self.oai_identifier,
        "article_id": self.article_id,
        "url": self.url,
        "datestamp": self.datestamp,
        # Dublin Core metadata
        "title": self.title,
        "subtitle": self.subtitle,
        "creators": self.creators,
        "subjects": self.subjects,
        "descriptions": self.descriptions,
        "publishers": self.publishers,
        "contributors": self.contributors,
        "dates": self.dates,
        "identifiers": self.identifiers,
        "types": self.types,
        "formats": self.formats,
        "sources": self.sources,
        "languages": self.languages,
        "coverages": self.coverages,
        "rights": self.rights,
        # Derived fields
        "doi": self.doi,
        "pages": self.pages,
        "resumo": self.resumo,
        "palavras_chave": self.palavras_chave,
        # OJS context
        "set_spec": self.set_spec,
        "set_specs": self.set_specs,
        "set_name": self.set_name,
        "issue_id": self.issue_id,
        "issue_number": self.issue_number,
        "section": self.section,
        "pdf_url": self.pdf_url,
        # Status
        "deleted": self.deleted,
    }
    return flat

Filtros

ojs_scrape.filters

Filtros para artigos coletados via OAI-PMH.

filter_by_author

filter_by_author(
    articles: Sequence[Article],
    query: str,
    *,
    case_sensitive: bool = False,
) -> list[Article]

Filtra artigos por nome de autor usando busca por substring.

Source code in src/ojs_scrape/filters.py
def filter_by_author(
    articles: Sequence[Article],
    query: str,
    *,
    case_sensitive: bool = False,
) -> list[Article]:
    """Filter articles by author name using a substring search.

    Deleted articles are never returned. Unless `case_sensitive` is true,
    both the query and the creator names are case-folded before comparing.
    """

    def normalize(text: str) -> str:
        return text if case_sensitive else text.casefold()

    needle = normalize(query)
    return [
        article
        for article in articles
        if not article.deleted and any(needle in normalize(name) for name in article.creators)
    ]

filter_by_issue_ids

filter_by_issue_ids(
    articles: Sequence[Article],
    issue_articles: Mapping[int, IssueArticleMetadata],
) -> list[Article]

Filtra e enriquece artigos que pertencem às edições informadas.

Source code in src/ojs_scrape/filters.py
def filter_by_issue_ids(
    articles: Sequence[Article],
    issue_articles: Mapping[int, IssueArticleMetadata],
) -> list[Article]:
    """Filter and enrich the articles that belong to the given issues.

    An article is kept when its `article_id` is a key of `issue_articles`
    with a non-`None` value; kept articles are enriched in place with that
    metadata.
    """
    kept: list[Article] = []
    for article in articles:
        metadata = issue_articles.get(article.article_id)
        if metadata is not None:
            _enrich_article_from_issue_metadata(article, metadata)
            kept.append(article)
    return kept

filter_by_set

filter_by_set(
    articles: Sequence[Article], set_specs: Sequence[str]
) -> list[Article]

Filtra artigos por set OAI-PMH.

Source code in src/ojs_scrape/filters.py
def filter_by_set(articles: Sequence[Article], set_specs: Sequence[str]) -> list[Article]:
    """Filter articles by OAI-PMH set.

    An article matches when any of its `set_specs` is wanted; when that list
    is empty, its single `set_spec` is used instead.
    """
    wanted = frozenset(set_specs)
    selected: list[Article] = []
    for article in articles:
        own_specs = article.set_specs or [article.set_spec]
        if not wanted.isdisjoint(own_specs):
            selected.append(article)
    return selected

filter_by_date_range

filter_by_date_range(
    articles: Sequence[Article],
    from_year: int | None = None,
    until_year: int | None = None,
) -> list[Article]

Filtra artigos por ano de publicação.

Source code in src/ojs_scrape/filters.py
def filter_by_date_range(
    articles: Sequence[Article],
    from_year: int | None = None,
    until_year: int | None = None,
) -> list[Article]:
    """Filter articles by publication year.

    The year is taken from the first entry of `article.dates`. Deleted
    articles, articles without dates and articles whose year cannot be
    determined are dropped. Both bounds are inclusive; `None` means unbounded.
    """
    selected: list[Article] = []
    for article in articles:
        if article.deleted or not article.dates:
            continue
        year = _year_from_date(article.dates[0])
        if year is None:
            continue
        if (from_year is not None and year < from_year) or (
            until_year is not None and year > until_year
        ):
            continue
        selected.append(article)
    return selected

filter_by_publication_date_range

filter_by_publication_date_range(
    articles: Sequence[Article],
    from_date: str | None = None,
    until_date: str | None = None,
) -> list[Article]

Filtra artigos por data de publicação (dc:date), não por datestamp OAI.

Source code in src/ojs_scrape/filters.py
def filter_by_publication_date_range(
    articles: Sequence[Article],
    from_date: str | None = None,
    until_date: str | None = None,
) -> list[Article]:
    """Filter articles by publication date (`dc:date`), not by OAI datestamp.

    The publication date is the first entry of `article.dates`. Deleted
    articles, articles without dates and articles whose date yields no key
    are dropped. Both bounds are inclusive; a bound without a key is ignored.
    """
    lower = _date_key(from_date)
    upper = _date_key(until_date)

    selected: list[Article] = []
    for article in articles:
        if article.deleted or not article.dates:
            continue
        key = _date_key(article.dates[0])
        if key is None:
            continue
        if (lower is not None and key < lower) or (upper is not None and key > upper):
            continue
        selected.append(article)

    return selected

Exportadores

ojs_scrape.exporters

Exportação de artigos em formatos tabulares e bibliográficos.

to_json

to_json(
    articles: Sequence[Article],
    output: PathLike | None = None,
    indent: int = 2,
) -> str

Exporta artigos como JSON.

Source code in src/ojs_scrape/exporters.py
def to_json(articles: Sequence[Article], output: PathLike | None = None, indent: int = 2) -> str:
    """Export articles as JSON.

    Deleted articles are left out. When `output` is given, the JSON text is
    also written to that path as UTF-8.

    Returns:
        The JSON text.
    """
    payload = [article.to_dict() for article in articles if not article.deleted]
    serialized = json.dumps(payload, ensure_ascii=False, indent=indent)
    if output is None:
        return serialized

    Path(output).write_text(serialized, encoding="utf-8")
    return serialized

to_csv

to_csv(
    articles: Sequence[Article],
    output: PathLike | None = None,
) -> str

Exporta artigos como CSV.

Source code in src/ojs_scrape/exporters.py
def to_csv(articles: Sequence[Article], output: PathLike | None = None) -> str:
    """Export articles as CSV.

    Deleted articles are left out. The `creators`, `palavras_chave` and
    `dates` lists are flattened into `"; "`-separated strings; columns are
    limited to `CSV_FIELDS`.

    Args:
        articles: Articles to export.
        output: Optional path; when given, the CSV text is also written there
            as UTF-8.

    Returns:
        The CSV text, header row included.
    """
    # newline="" keeps the csv writer's own row terminators untouched.
    buffer = StringIO(newline="")
    writer = csv.DictWriter(buffer, fieldnames=CSV_FIELDS, extrasaction="ignore")
    writer.writeheader()

    for article in articles:
        if article.deleted:
            continue
        row = article.to_dict()
        row["creators"] = "; ".join(article.creators)
        row["palavras_chave"] = "; ".join(article.palavras_chave)
        row["dates"] = "; ".join(article.dates)
        writer.writerow(row)

    result = buffer.getvalue()

    if output is not None:
        # The text already carries "\r\n" row terminators; without newline=""
        # Windows would translate each "\n" again and write "\r\r\n".
        Path(output).write_text(result, encoding="utf-8", newline="")

    return result

to_bibtex

to_bibtex(
    articles: Sequence[Article],
    output: PathLike | None = None,
) -> str

Exporta artigos como BibTeX.

Source code in src/ojs_scrape/exporters.py
def to_bibtex(articles: Sequence[Article], output: PathLike | None = None) -> str:
    """Export articles as BibTeX.

    Deleted articles are left out; the remaining ones are numbered from 1 and
    each becomes one `@article` entry. When `output` is given, the text is
    also written to that path as UTF-8.

    Returns:
        The BibTeX text, entries separated by a blank line.
    """
    live_articles = (article for article in articles if not article.deleted)

    entries: list[str] = []
    for position, article in enumerate(live_articles, start=1):
        key = _bibtex_key(article, position)
        field_lines = [
            f"  {name} = {{{_bibtex_escape(value)}}}," for name, value in _bibtex_fields(article)
        ]
        entries.append("\n".join([f"@article{{{key},", *field_lines, "}"]))

    result = "\n\n".join(entries)
    if output is None:
        return result

    Path(output).write_text(result, encoding="utf-8")
    return result

TOC OJS

ojs_scrape.toc

Scraping leve de TOCs (Table of Contents) de edições OJS.

TocArticle

Bases: TypedDict

Metadados mínimos de um artigo extraídos da TOC.

Source code in src/ojs_scrape/toc.py
class TocArticle(TypedDict):
    """Minimal metadata for one article extracted from the TOC."""

    title: str
    url: str
    section: str
    authors: str
    pages: str
    pdf_url: str
    issue_id: int
    issue_number: str

IssueToc

Bases: TypedDict

Resultado do scrape da TOC de uma edição OJS.

Source code in src/ojs_scrape/toc.py
class IssueToc(TypedDict):
    """Result of scraping the TOC of one OJS issue."""

    # NOTE(review): presumably keyed by article id, as the `scrape_issue_toc`
    # docstring suggests; confirm against the TOC parser.
    articles: dict[int, TocArticle]
    issue_id: int
    issue_number: str
    sections: list[str]

IssueInfo

Bases: TypedDict

Informação básica de uma edição na página de arquivo.

Source code in src/ojs_scrape/toc.py
class IssueInfo(TypedDict):
    """Basic information about one issue listed on the archive page."""

    # Issue id; `find_issue_urls` derives it from the link's href.
    issue_id: int
    # Issue URL; `find_issue_urls` resolves the href against the archive URL.
    url: str
    # Title; `find_issue_urls` takes it from the text of the link element.
    title: str

scrape_issue_toc

scrape_issue_toc(
    issue_url: str, timeout: float = 30.0
) -> IssueToc

Faz scrape da TOC de uma edição OJS e retorna article_id → metadados.

Source code in src/ojs_scrape/toc.py
def scrape_issue_toc(issue_url: str, timeout: float = 30.0) -> IssueToc:
    """Scrape the TOC of an OJS issue and return its parsed contents.

    Args:
        issue_url: URL of the issue page.
        timeout: Network timeout in seconds.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    headers = {"User-Agent": USER_AGENT}
    page = requests.get(issue_url, headers=headers, timeout=timeout)
    page.raise_for_status()
    return _parse_issue_toc_html(page.text, issue_url)

find_issue_urls

find_issue_urls(
    base_url: str,
    archive_path: str = "issue/archive",
    timeout: float = 30.0,
) -> list[IssueInfo]

Faz scrape da página de arquivo do periódico e retorna edições disponíveis.

Source code in src/ojs_scrape/toc.py
def find_issue_urls(
    base_url: str,
    archive_path: str = "issue/archive",
    timeout: float = 30.0,
) -> list[IssueInfo]:
    """Scrape the journal's archive page and return the issues listed there.

    Each issue id appears once in the result; when several links share an id,
    the data from the last one is kept.

    Args:
        base_url: Base URL of the journal.
        archive_path: Path of the archive page, relative to `base_url`.
        timeout: Network timeout in seconds.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    archive_url = urljoin(f"{base_url.rstrip('/')}/", archive_path)
    response = requests.get(archive_url, headers={"User-Agent": USER_AGENT}, timeout=timeout)
    response.raise_for_status()

    anchors = BeautifulSoup(response.text, "html.parser").find_all("a", href=ISSUE_RE)

    found: dict[int, IssueInfo] = {}
    for anchor in anchors:
        href = str(anchor.get("href", ""))
        issue_id = _issue_id_from_url(href)
        if issue_id != 0:
            info: IssueInfo = {
                "issue_id": issue_id,
                "url": urljoin(archive_url, href),
                "title": _node_text(anchor),
            }
            found[issue_id] = info

    return list(found.values())

PDFs

ojs_scrape.pdf

Download de PDFs de artigos OJS.

download_pdf

download_pdf(
    article_url: str,
    output_dir: str | Path,
    filename: str | None = None,
    timeout: float = 60.0,
    session: Session | None = None,
) -> Path | None

Baixa o PDF de um artigo OJS.

Tenta primeiro a própria URL informada e, em seguida, a URL direta de download derivada dela. Se ambas falharem, procura um link de PDF na página do artigo.

Source code in src/ojs_scrape/pdf.py
def download_pdf(
    article_url: str,
    output_dir: str | Path,
    filename: str | None = None,
    timeout: float = 60.0,
    session: requests.Session | None = None,
) -> Path | None:
    """Download the PDF of an OJS article.

    The given URL is tried first as-is, then the direct download URL derived
    from it. If both fail, the article page is searched for a PDF link and
    the candidate URLs built from that link are tried in order.

    Args:
        article_url: URL of the article (or of its PDF).
        output_dir: Destination directory; created when missing.
        filename: File name to use; defaults to `<article id>.pdf`.
        timeout: Network timeout in seconds.
        session: Session to reuse. When omitted, a temporary session is
            created and closed before returning.

    Returns:
        The path of the saved file, or `None` when no PDF could be downloaded.
    """
    destination = Path(output_dir)
    destination.mkdir(parents=True, exist_ok=True)

    http = session or requests.Session()
    owns_session = session is None

    def candidate_urls():
        # Lazy on purpose: each later step runs only after the earlier
        # candidates have failed.
        yield article_url
        yield _direct_download_url(article_url)
        fallback_url = find_pdf_url(article_url, timeout=timeout, session=http)
        if fallback_url:
            yield from _pdf_candidate_urls(fallback_url)

    try:
        pdf = None
        for url in candidate_urls():
            pdf = _fetch_pdf(http, url, timeout)
            if pdf is not None:
                break

        if pdf is None:
            logger.warning("Não foi possível baixar PDF: %s", article_url)
            return None

        target_name = _safe_filename(filename or f"{_article_id(article_url)}.pdf")
        filepath = destination / target_name
        filepath.write_bytes(pdf)
        logger.info("PDF baixado: %s", filepath)
        return filepath
    finally:
        if owns_session:
            http.close()

find_pdf_url

find_pdf_url(
    article_url: str,
    timeout: float = 30.0,
    session: Session | None = None,
) -> str

Procura a URL de PDF em uma página de artigo OJS.

Source code in src/ojs_scrape/pdf.py
def find_pdf_url(
    article_url: str,
    timeout: float = 30.0,
    session: requests.Session | None = None,
) -> str:
    """Look for the PDF URL on an OJS article page.

    Links recognised as OJS galley links win; otherwise the first link whose
    href contains "download" or whose text contains "pdf" is used.

    Args:
        article_url: URL of the article page.
        timeout: Network timeout in seconds.
        session: Session to reuse. When omitted, a temporary session is
            created and closed before returning.

    Returns:
        The absolute URL of the link found, or `""` when nothing matches or
        the request fails (failures are logged, not raised).
    """
    http = session or requests.Session()
    owns_session = session is None
    try:
        response = http.get(article_url, headers={"User-Agent": USER_AGENT}, timeout=timeout)
        response.raise_for_status()
        anchors = list(BeautifulSoup(response.text, "html.parser").find_all("a", href=True))

        # First pass: links that look like OJS galley links.
        for anchor in anchors:
            href = str(anchor.get("href", ""))
            label = anchor.get_text(" ", strip=True)
            raw_classes = anchor.attrs.get("class", "")
            if isinstance(raw_classes, list):
                classes = " ".join(str(item) for item in raw_classes)
            else:
                classes = str(raw_classes or "")
            if _is_ojs_galley_link(href, label, classes):
                return urljoin(article_url, href)

        # Second pass: looser match on the href or on the link text.
        for anchor in anchors:
            href = str(anchor.get("href", ""))
            label = anchor.get_text(" ", strip=True).casefold()
            if "download" in href.casefold() or "pdf" in label:
                return urljoin(article_url, href)
    except requests.RequestException as exc:
        logger.warning("Falha ao procurar PDF em %s: %s", article_url, exc)
    finally:
        if owns_session:
            http.close()

    return ""

download_pdfs

download_pdfs(
    articles: Sequence[Article],
    output_dir: str | Path,
    timeout: float = 60.0,
    limit: int | None = None,
) -> list[Path]

Baixa PDFs de múltiplos artigos.

Source code in src/ojs_scrape/pdf.py
def download_pdfs(
    articles: Sequence[Article],
    output_dir: str | Path,
    timeout: float = 60.0,
    limit: int | None = None,
) -> list[Path]:
    """Download the PDFs of several articles.

    Only the first `limit` articles are considered when `limit` is given.
    Articles without any URL are skipped, and a file that already exists in
    `output_dir` is reused instead of being downloaded again.

    Returns:
        Paths of the files that exist after the run (reused or downloaded).
    """
    target_dir = Path(output_dir)
    batch = list(articles if limit is None else articles[:limit])

    saved: list[Path] = []
    with requests.Session() as session:
        for article in batch:
            url = article.pdf_url or article.url
            if not url:
                continue

            filename = f"{article.article_id}.pdf" if article.article_id else None
            if filename is not None:
                already_there = target_dir / _safe_filename(filename)
                if already_there.exists():
                    saved.append(already_there)
                    continue

            fetched = download_pdf(
                url,
                target_dir,
                filename=filename,
                timeout=timeout,
                session=session,
            )
            if fetched is not None:
                saved.append(fetched)
    return saved