# Single-file FastAPI app: scrolls a page with SeleniumBase and returns its HTML.
import time
|
|
from fastapi import FastAPI
|
|
from pydantic import BaseModel
|
|
from seleniumbase import SB
|
|
|
|
# FastAPI application exposing one POST endpoint that scrolls a page and
# returns its rendered HTML. (The original description string was garbled:
# "scrolls pages get is html content".)
app = FastAPI(
    title="SeleniumBase Image Scraper API",
    description="An API that scrolls pages and returns their HTML content",
    version="1.0.0",
)
|
|
|
|
class ScrapeRequest(BaseModel):
    """Request body for the scrape endpoint.

    ``url`` is the absolute URL of the page to load and scroll.
    """

    url: str
|
|
|
|
@app.post("/")
def _get_image_urls(req: ScrapeRequest):
    """Load ``req.url`` in a headless undetected-Chrome session, scroll to
    the bottom of the page to trigger lazy loading, and return the final
    rendered HTML.

    Returns:
        dict: ``{"pageSource": <full HTML after scrolling completes>}``
    """
    with SB(uc=True, headless=True) as sb:
        sb.activate_cdp_mode(req.url)
        sb.sleep(5)  # let the initial page render

        # Scroll two screen-heights per step so lazy loaders keep firing.
        step = sb.execute_script("return window.screen.height") * 2
        scroll_count = 1
        scroll_limit = 500  # hard cap so an ever-growing page can't loop forever

        while True:
            offset = step * scroll_count
            sb.execute_script(f"window.scrollTo(0, {offset});")
            scroll_count += 1

            time.sleep(0.5)

            total_height = sb.execute_script("return document.body.scrollHeight")

            # Stop once we've scrolled past the bottom, or hit the safety cap.
            if offset > total_height or scroll_count > scroll_limit:
                break

        sb.sleep(2)  # final grace period for lazy-loaded elements

        return {"pageSource": sb.get_page_source()}
|
|
|
|
if __name__ == "__main__":
    # Development entry point: serve the app on all interfaces, port 8001.
    # Imported here so uvicorn is only required when running this file directly.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8001)
|