Source code for sci_watch.source_wrappers.arxiv_wrapper

from datetime import datetime

import arxiv
import requests
from bs4 import BeautifulSoup

from sci_watch.source_wrappers.abstract_wrapper import SourceWrapper
from sci_watch.source_wrappers.document import Document
from sci_watch.utils.logger import get_logger

LOGGER = get_logger(__name__)

# the arxiv API cannot serve `id_list` queries much larger than ~200 ids,
# so `update_documents` batches its requests in chunks of this size
_ARXIV_QUERY_MAX_NUMBER_OF_DOCUMENTS = 200


class ArxivWrapper(SourceWrapper):
    """
    Wrapper for Arxiv
    """

    def __init__(
        self,
        search_topic: str,
        max_documents: int,
        start_date: datetime,
        end_date: datetime,
        use_abstract_as_content: bool = True,
    ) -> None:
        """
        Parameters
        ----------
        search_topic: str
            Search topic (e.g. "cs" for Computer Science)
        max_documents: int
            Maximum number of papers to retrieve (before the search query is applied)
        start_date: datetime
            Start date to consider papers (papers published before that date will be ignored)
        end_date: datetime
            End date to consider papers (papers published after that date will be ignored)
        use_abstract_as_content: bool
            Whether to use the abstract of the paper as its content
        """
        self.search_topic = search_topic
        self.max_documents = max_documents
        self.start_date = start_date
        self.end_date = end_date
        self.use_abstract_as_content = use_abstract_as_content

        self.documents: list[Document] = []

    def __repr__(self) -> str:
        return (
            f"ArxivWrapper(search_topic={self.search_topic}, "
            f"max_documents={self.max_documents}, "
            f"use_abstract_as_content={self.use_abstract_as_content})"
        )

    def _get_latest_papers_ids(self, max_papers: int = 500) -> list[str]:
        """
        Get the ids of the latest `max_papers` papers for the topic `self.search_topic`

        Parameters
        ----------
        max_papers: int
            Maximum number of paper ids to retrieve

        Returns
        -------
        list[str]
            A list of `max_papers` paper ids retrieved from the topic `self.search_topic`
        """
        main_page_html = requests.get(
            f"https://arxiv.org/list/{self.search_topic}/recent?show={max_papers}"
        )
        soup = BeautifulSoup(main_page_html.text, "html.parser")

        ids = []
        articles_section = soup.find_all("dl", {"id": "articles"})[0]
        abstract_tags = articles_section.find_all("a", {"title": "Abstract"})
        for tag in abstract_tags:
            current_arxiv_id = tag["id"]
            ids.append(current_arxiv_id)

        return ids
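    # For reference, the scraping in `_get_latest_papers_ids` assumes the
    # https://arxiv.org/list/<topic>/recent page is structured roughly as
    # sketched below (inferred from the selectors above, not an official
    # contract; arxiv.org may change its markup at any time):
    #
    #   <dl id="articles">
    #     <dt>... <a title="Abstract" id="2301.00001">arXiv:2301.00001</a> ...</dt>
    #     <dd>... title, authors, subjects ...</dd>
    #   </dl>
    #
    # where each anchor's `id` attribute is the paper id collected above.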
    def update_documents(self) -> None:
        """
        Update `self.documents` with the latest papers published between
        `self.start_date` and `self.end_date`
        """
        LOGGER.info(
            "Checking Arxiv papers from %s to %s",
            datetime.strftime(self.start_date, "%d %B %Y"),
            datetime.strftime(self.end_date, "%d %B %Y"),
        )

        paper_ids = self._get_latest_papers_ids(max_papers=self.max_documents)

        LOGGER.info("Retrieved %d paper ids", len(paper_ids))

        query_results = []
        # batch the retrieval since the arxiv API cannot serve more than ~200 documents per query
        for idx in range(0, len(paper_ids), _ARXIV_QUERY_MAX_NUMBER_OF_DOCUMENTS):
            search = arxiv.Search(
                id_list=paper_ids[idx : idx + _ARXIV_QUERY_MAX_NUMBER_OF_DOCUMENTS]
            )
            tmp_results = list(search.results())
            query_results.extend(tmp_results)

        LOGGER.info("Documents retrieved: %d", len(query_results))

        self.documents = []
        for query_result in query_results:
            document_date = query_result.updated  # alternatively: query_result.published
            if self.start_date <= document_date <= self.end_date:
                if self.use_abstract_as_content:
                    content = query_result.summary
                else:
                    raise NotImplementedError(
                        "Only the abstract is currently supported as paper content"
                    )

                self.documents.append(
                    Document(
                        title=query_result.title.strip(),
                        url=query_result.pdf_url,
                        date=query_result.published,
                        content=content.strip(),
                    )
                )

        if len(self.documents) == 0:
            LOGGER.warning("Update documents resulted in an empty list in ArxivWrapper")
        else:
            LOGGER.info("%d papers retrieved from ArxivWrapper", len(self.documents))
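
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module). It
# assumes `sci_watch` is installed and that `Document` exposes its constructor
# fields (`title`, `url`, ...) as attributes. The `arxiv` library returns
# timezone-aware timestamps, hence the UTC-aware date range below.
if __name__ == "__main__":
    from datetime import timedelta, timezone

    now = datetime.now(timezone.utc)

    wrapper = ArxivWrapper(
        search_topic="cs",  # Computer Science listing
        max_documents=100,  # scrape at most 100 recent paper ids
        start_date=now - timedelta(days=7),
        end_date=now,
    )
    wrapper.update_documents()

    for doc in wrapper.documents:
        print(doc.title, doc.url)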