# scrollable-scrapper/scraper_api.py
# SeleniumBase-backed API: loads a URL, scrolls to the bottom, returns the HTML.
import time
from fastapi import FastAPI
from pydantic import BaseModel
from seleniumbase import SB
# FastAPI application object. Exposes a single POST endpoint that loads a
# URL in a headless browser, scrolls to the bottom, and returns the HTML.
app = FastAPI(
    title="SeleniumBase Image Scraper API",
    description="An API that scrolls a page and returns its HTML content",
    version="1.0.0",
)
class ScrapeRequest(BaseModel):
    """Request body for the scrape endpoint."""

    # Fully-qualified URL of the page to load and scroll.
    url: str
@app.post("/")
def _get_image_urls(req: ScrapeRequest):
    """Load ``req.url`` in a headless UC-mode browser, scroll until the
    bottom of the page is reached (or a safety limit trips) so that
    lazy-loaded content renders, then return the full page HTML.

    Returns:
        dict: ``{"pageSource": <page HTML as a string>}``
    """
    with SB(uc=True, headless=True) as sb:
        sb.activate_cdp_mode(req.url)
        sb.sleep(5)  # wait for the initial page load

        # Scroll two screen heights per step (the original doubled
        # window.screen.height; keep that stride).
        scroll_step = sb.execute_script("return window.screen.height") * 2
        max_scrolls = 500  # safety limit to prevent infinite loops

        for step in range(1, max_scrolls + 1):
            target = scroll_step * step
            sb.execute_script(f"window.scrollTo(0, {target});")
            sb.sleep(0.5)  # give newly revealed content time to start loading
            # Re-read the document height each pass: infinite-scroll pages
            # grow it as content is appended.
            scroll_height = sb.execute_script("return document.body.scrollHeight")
            if target > scroll_height:
                break  # scrolled past the current bottom of the page

        # Final wait for any remaining lazy-loaded elements.
        sb.sleep(2)
        return {"pageSource": sb.get_page_source()}
# Script entry point: serve the API with uvicorn on all interfaces, port 8001.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8001)