Source code for sci_watch.source_wrappers.techcrunch_wrapper

import re
from datetime import datetime, timedelta

import pytz
import requests
from bs4 import BeautifulSoup

from sci_watch.source_wrappers.abstract_wrapper import SourceWrapper
from sci_watch.source_wrappers.document import Document
from sci_watch.utils.logger import get_logger

# Module-level logger, obtained from the project's `get_logger` helper for this module's name
LOGGER = get_logger(__name__)

# Base URL of the TechCrunch site; the wrapper appends its search topic to it to build the listing page URL
_TECH_CRUNCH_BLOG_URL = "https://www.techcrunch.com/"


class NotABlogPost(Exception):
    """
    Raised when a fetched page does not contain the expected blog post content container
    """


def _convert_blog_date(now: datetime, blog_date: str) -> datetime:
    """
    Converts TechCrunch blog date from string to datetime
    """

    blog_date = blog_date.lower()

    patterns = [
        (r"(\d+)\s*seconds?\s*ago", "seconds"),
        (r"(\d+)\s*minutes?\s*ago", "minutes"),
        (r"(\d+)\s*mins?\s*ago", "minutes"),
        (r"(\d+)\s*hours?\s*ago", "hours"),
        (r"(\d+)\s*days?\s*ago", "days"),
    ]

    if "ago" in blog_date:
        for pattern, unit in patterns:
            match = re.match(pattern, blog_date)
            if match:
                value = int(match.group(1))

                if unit == "seconds":
                    return now - timedelta(seconds=value)
                elif unit == "minutes":
                    return now - timedelta(minutes=value)
                elif unit == "hours":
                    return now - timedelta(hours=value)
                elif unit == "days":
                    return now - timedelta(days=value)
    elif "•" in blog_date:
        date_str = blog_date.split("•")[1].strip()
        return datetime.strptime(date_str, "%B %d, %Y")

    else:
        raise ValueError("Unknown date format")


class TechCrunchWrapper(SourceWrapper):
    """
    Source wrapper collecting TechCrunch blog posts listed on a topic page

    Parameters
    ----------
    search_topic: str
        Path appended to the TechCrunch base URL to build the listing page URL
    max_documents: int
        Maximum number of documents kept after an update
    start_date: datetime
        Earliest post date (inclusive). It is compared with UTC-localized post dates,
        so it needs to be timezone-aware
    end_date: datetime
        Latest post date (inclusive). Same timezone requirement as `start_date`
    """

    # Seconds to wait on each HTTP request before giving up (requests waits forever by default)
    _REQUEST_TIMEOUT = 30

    def __init__(
        self,
        search_topic: str,
        max_documents: int,
        start_date: datetime,
        end_date: datetime,
    ):
        self.search_topic = search_topic
        self.max_documents = max_documents
        self.start_date = start_date
        self.end_date = end_date

        self.documents: list[Document] = []

    @staticmethod
    def _get_blog_content(blog_url: str) -> str:
        """
        Retrieve Tech Crunch blog post content from its url

        Parameters
        ----------
        blog_url: str
            Url to a Tech Crunch AI blog post

        Returns
        -------
        str:
            Content of the blog post

        Raises
        ------
        NotABlogPost
            If the page has no post content container
        """
        response = requests.get(blog_url, timeout=TechCrunchWrapper._REQUEST_TIMEOUT)

        soup = BeautifulSoup(response.text, "html.parser")

        content_div = soup.find(
            "div",
            {
                "class": "entry-content wp-block-post-content is-layout-constrained wp-block-post-content-is-layout-constrained"
            },
        )

        # `find` returns None when the container is absent, i.e. the page is not a blog post
        if content_div is None:
            raise NotABlogPost()

        return content_div.text

    def update_documents(self):
        """
        Update the `self.documents` by looking for the latest Tech Crunch AI blog posts (between
        `self.start_date` and end `self.end_date`)

        Listing cards without a title link, a date tag or a parsable date are skipped, as are
        pages that turn out not to be blog posts. The result is sorted from newest to oldest
        and truncated to `self.max_documents`.
        """
        LOGGER.info(
            "Checking TechCrunch blogs from %s to %s",
            datetime.strftime(self.start_date, "%d %B %Y"),
            datetime.strftime(self.end_date, "%d %B %Y"),
        )

        self.documents = []

        main_page_html = requests.get(
            _TECH_CRUNCH_BLOG_URL + self.search_topic, timeout=self._REQUEST_TIMEOUT
        )

        soup = BeautifulSoup(main_page_html.text, "html.parser")

        # Parsed dates are localized as UTC below, so relative dates ("3 hours ago") must be
        # resolved against the current UTC time, not the machine's local time
        now_utc = datetime.now(pytz.UTC).replace(tzinfo=None)

        for tag in soup.find_all(
            "div",
            {
                "class": "loop-card loop-card--post-type-post loop-card--default loop-card--horizontal loop-card--wide loop-card--force-storyline-aspect-ratio"
            },
        ):
            tag_header = tag.find("a", {"class": "loop-card__title-link"})

            tag_date = tag.find(
                "time",
                {
                    "class": "loop-card__meta-item loop-card__time wp-block-tc23-post-time-ago"
                },
            )

            # A card missing its title link or its date cannot be turned into a document
            if tag_header is None or tag_date is None:
                LOGGER.warning("Skipping TechCrunch card without title link or date")
                continue

            blog_title = tag_header.get_text().strip()
            blog_url = tag_header.get("href")
            if not blog_url:
                LOGGER.warning("Skipping TechCrunch card without url: %s", blog_title)
                continue

            try:
                naive_datetime = _convert_blog_date(
                    now=now_utc, blog_date=tag_date.get_text().strip()
                )
            except ValueError:
                naive_datetime = None

            # One unreadable date should not abort the whole update
            if naive_datetime is None:
                LOGGER.warning(
                    "Skipping TechCrunch card with unknown date format: %s", blog_title
                )
                continue

            blog_datetime = pytz.UTC.localize(naive_datetime)

            if self.start_date <= blog_datetime <= self.end_date:
                try:
                    blog_long_content = self._get_blog_content(blog_url=blog_url)
                except NotABlogPost:
                    continue
                self.documents.append(
                    Document(
                        title=blog_title,
                        url=blog_url,
                        date=blog_datetime,
                        content=blog_long_content.strip(),
                    )
                )

        # Newest first, then keep at most `max_documents` entries
        self.documents.sort(key=lambda doc: doc.date, reverse=True)
        self.documents = self.documents[: self.max_documents]

        if not self.documents:
            LOGGER.warning(
                "Update documents resulted in an empty list in TechCrunchWrapper"
            )
        else:
            LOGGER.info(
                "%i blogs retrieved from TechCrunchWrapper", len(self.documents)
            )