Source code for sci_watch.source_wrappers.arxiv_wrapper

from datetime import datetime

import arxiv
import requests
from bs4 import BeautifulSoup

from sci_watch.source_wrappers.abstract_wrapper import SourceWrapper
from sci_watch.source_wrappers.document import Document
from sci_watch.utils.logger import get_logger

LOGGER = get_logger(__name__)

# the arxiv API cannot serve `id_list` queries much larger than ~200 ids,
# so `update_documents` batches its requests in chunks of this size
_ARXIV_QUERY_MAX_NUMBER_OF_DOCUMENTS = 200


class ArxivWrapper(SourceWrapper):
    """
    Wrapper for Arxiv
    """

    def __init__(
        self,
        search_topic: str,
        max_documents: int,
        start_date: datetime,
        end_date: datetime,
        use_abstract_as_content: bool = True,
    ) -> None:
        """
        Parameters
        ----------
        search_topic: str
            Search topic (e.g. "cs" for Computer Science)
        max_documents: int
            Maximum number of papers to retrieve (before the search query is applied)
        start_date: datetime
            Start date to consider papers (papers published before that date will be ignored)
        end_date: datetime
            End date to consider papers (papers published after that date will be ignored)
        use_abstract_as_content: bool
            Whether to use the abstract of the paper as its content
        """
        self.search_topic = search_topic
        self.max_documents = max_documents
        self.start_date = start_date
        self.end_date = end_date
        self.use_abstract_as_content = use_abstract_as_content

        self.documents: list[Document] = []

    def __repr__(self) -> str:
        return (
            f"ArxivWrapper(search_topic={self.search_topic}, "
            f"max_documents={self.max_documents}, "
            f"use_abstract_as_content={self.use_abstract_as_content})"
        )

    def _get_latest_papers_ids(self, max_papers: int = 500) -> list[str]:
        """
        Get the ids of the latest `max_papers` papers for the topic `self.search_topic`

        Parameters
        ----------
        max_papers: int
            Maximum number of paper ids to retrieve

        Returns
        -------
        list[str]
            A list of `max_papers` paper ids retrieved from the topic `self.search_topic`
        """
        main_page_html = requests.get(
            f"https://arxiv.org/list/{self.search_topic}/recent?show={max_papers}"
        )
        soup = BeautifulSoup(main_page_html.text, "html.parser")

        ids = []
        articles_section = soup.find_all("dl", {"id": "articles"})[0]
        abstract_tags = articles_section.find_all("a", {"title": "Abstract"})
        for tag in abstract_tags:
            current_arxiv_id = tag["id"]
            ids.append(current_arxiv_id)

        return ids
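    # For reference, the scraping in `_get_latest_papers_ids` assumes the
    # https://arxiv.org/list/<topic>/recent page is structured roughly as
    # sketched below (inferred from the selectors above, not an official
    # contract; arxiv.org may change its markup at any time):
    #
    #   <dl id="articles">
    #     <dt>... <a title="Abstract" id="2301.00001">arXiv:2301.00001</a> ...</dt>
    #     <dd>... title, authors, subjects ...</dd>
    #   </dl>
    #
    # where each anchor's `id` attribute is the paper id collected above.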
    def update_documents(self) -> None:
        """
        Update `self.documents` with the latest papers published between
        `self.start_date` and `self.end_date`
        """
        LOGGER.info(
            "Checking Arxiv papers from %s to %s",
            datetime.strftime(self.start_date, "%d %B %Y"),
            datetime.strftime(self.end_date, "%d %B %Y"),
        )

        paper_ids = self._get_latest_papers_ids(max_papers=self.max_documents)

        LOGGER.info("Retrieved %d paper ids", len(paper_ids))

        query_results = []
        # batch the retrieval since the arxiv API cannot serve more than ~200 documents per query
        for idx in range(0, len(paper_ids), _ARXIV_QUERY_MAX_NUMBER_OF_DOCUMENTS):
            search = arxiv.Search(
                id_list=paper_ids[idx : idx + _ARXIV_QUERY_MAX_NUMBER_OF_DOCUMENTS]
            )
            tmp_results = list(search.results())
            query_results.extend(tmp_results)

        LOGGER.info("Documents retrieved: %d", len(query_results))

        self.documents = []
        for query_result in query_results:
            document_date = query_result.updated  # alternatively: query_result.published
            if self.start_date <= document_date <= self.end_date:
                if self.use_abstract_as_content:
                    content = query_result.summary
                else:
                    raise NotImplementedError(
                        "Only the abstract is currently supported as paper content"
                    )

                self.documents.append(
                    Document(
                        title=query_result.title.strip(),
                        url=query_result.pdf_url,
                        date=query_result.published,
                        content=content.strip(),
                    )
                )

        if len(self.documents) == 0:
            LOGGER.warning("Update documents resulted in an empty list in ArxivWrapper")
        else:
            LOGGER.info("%d papers retrieved from ArxivWrapper", len(self.documents))
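
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module). It
# assumes `sci_watch` is installed and that `Document` exposes its constructor
# fields (`title`, `url`, ...) as attributes. The `arxiv` library returns
# timezone-aware timestamps, hence the UTC-aware date range below.
if __name__ == "__main__":
    from datetime import timedelta, timezone

    now = datetime.now(timezone.utc)

    wrapper = ArxivWrapper(
        search_topic="cs",  # Computer Science listing
        max_documents=100,  # scrape at most 100 recent paper ids
        start_date=now - timedelta(days=7),
        end_date=now,
    )
    wrapper.update_documents()

    for doc in wrapper.documents:
        print(doc.title, doc.url)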