# scrollable-scrapper/scraper_api.py
# SeleniumBase-backed API: loads a URL, scrolls to the bottom, returns the HTML.
import time
from fastapi import FastAPI
from pydantic import BaseModel
from seleniumbase import SB
# FastAPI application object. Exposes a single POST endpoint that loads a
# URL in a headless browser, scrolls to the bottom, and returns the HTML.
app = FastAPI(
    title="SeleniumBase Image Scraper API",
    description="An API that scrolls a page and returns its HTML content",
    version="1.0.0",
)
class ScrapeRequest(BaseModel):
    """Request body for the scrape endpoint."""

    # Fully-qualified URL of the page to load and scroll.
    url: str
@app.post("/")
def _get_image_urls(req: ScrapeRequest):
    """Load ``req.url`` in a headless UC-mode browser, scroll until the
    bottom of the page is reached (or a safety limit trips) so that
    lazy-loaded content renders, then return the full page HTML.

    Returns:
        dict: ``{"pageSource": <page HTML as a string>}``
    """
    with SB(uc=True, headless=True) as sb:
        sb.activate_cdp_mode(req.url)
        sb.sleep(5)  # wait for the initial page load

        # Scroll two screen heights per step (the original doubled
        # window.screen.height; keep that stride).
        scroll_step = sb.execute_script("return window.screen.height") * 2
        max_scrolls = 500  # safety limit to prevent infinite loops

        for step in range(1, max_scrolls + 1):
            target = scroll_step * step
            sb.execute_script(f"window.scrollTo(0, {target});")
            sb.sleep(0.5)  # give newly revealed content time to start loading
            # Re-read the document height each pass: infinite-scroll pages
            # grow it as content is appended.
            scroll_height = sb.execute_script("return document.body.scrollHeight")
            if target > scroll_height:
                break  # scrolled past the current bottom of the page

        # Final wait for any remaining lazy-loaded elements.
        sb.sleep(2)
        return {"pageSource": sb.get_page_source()}
# Script entry point: serve the API with uvicorn on all interfaces, port 8001.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8001)