chore: initial commit

This commit is contained in:
Rodrigo Verdiani 2026-03-31 08:52:28 -03:00
commit c26c578e01
6 changed files with 1189 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
latest_logs/*
.idea
downloaded_files/*

48
Dockerfile Normal file
View File

@ -0,0 +1,48 @@
FROM ubuntu:24.04

# Build-time only: suppress apt's interactive prompts without polluting the
# runtime environment (ARG does not persist into the final image's env).
ARG DEBIAN_FRONTEND=noninteractive

# Virtual display used by the entrypoint's Xvfb instance.
ENV DISPLAY=:99

# System deps for headless Chrome + Xvfb.
# Cleaned apt lists in the same layer; packages sorted for diffability.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    fonts-liberation \
    fonts-noto-color-emoji \
    libasound2t64 \
    libatk-bridge2.0-0 \
    libcairo2 \
    libcups2 \
    libgbm1 \
    libgtk-3-0 \
    libnss3 \
    libpango-1.0-0 \
    libxcomposite1 \
    libxdamage1 \
    libxfixes3 \
    libxkbcommon0 \
    libxrandr2 \
    unzip \
    wget \
    xvfb \
    && rm -rf /var/lib/apt/lists/*

# Install Google Chrome — download, install, and clean up in ONE layer so the
# .deb and the apt lists never persist in the image.
RUN wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
    && apt-get update \
    && apt-get install -y --no-install-recommends ./google-chrome-stable_current_amd64.deb \
    && rm google-chrome-stable_current_amd64.deb \
    && rm -rf /var/lib/apt/lists/*

# Python toolchain.
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 \
    python3-pip \
    python3-venv \
    && rm -rf /var/lib/apt/lists/*

# Create a virtualenv and put it first on PATH so plain `pip`/`python` use it.
RUN python3 -m venv /venv
ENV PATH="/venv/bin:$PATH"

# Python deps (--no-cache-dir keeps pip's wheel cache out of the layer).
# NOTE(review): versions are unpinned; consider pinning (or installing from
# the project's lock file) for reproducible builds.
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir seleniumbase pika fastapi uvicorn bs4

# Pre-fetch the matching chromedriver so it isn't downloaded at runtime.
RUN seleniumbase get chromedriver

# App
WORKDIR /app
COPY scraper_api.py /app/scraper_api.py

# Entrypoint (starts Xvfb, then uvicorn on :8000).
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

# NOTE(review): the container still runs as root; consider adding a non-root
# USER once Chrome/Xvfb/chromedriver permissions are verified under it.
EXPOSE 8000
ENTRYPOINT ["/entrypoint.sh"]

7
entrypoint.sh Normal file
View File

@ -0,0 +1,7 @@
#!/bin/bash
# Container entrypoint: bring up a virtual X display, then run the API.
set -euo pipefail

# Start virtual display on :99 (matches ENV DISPLAY in the Dockerfile).
Xvfb :99 -screen 0 1920x1080x24 -nolisten tcp &

# Wait (up to ~5s) for Xvfb to create its socket before the app — and
# Chrome — try to use the display; avoids a startup race.
for _ in $(seq 1 50); do
    [ -S /tmp/.X11-unix/X99 ] && break
    sleep 0.1
done

# exec so uvicorn becomes PID 1 and receives SIGTERM from `docker stop`.
exec uvicorn scraper_api:app --host 0.0.0.0 --port 8000

10
pyproject.toml Normal file
View File

@ -0,0 +1,10 @@
[project]
name = "sb-taimu"
version = "0.1.0"
description = "Add your description here"
# NOTE(review): Python 3.14 floor — the Docker image uses Ubuntu 24.04's
# system python3; confirm it satisfies this constraint.
requires-python = ">=3.14"
# NOTE(review): the Dockerfile additionally installs pika and bs4, which are
# not declared here; consider adding them so uv.lock matches the image.
dependencies = [
    "fastapi>=0.135.2",
    "seleniumbase>=4.47.8",
    "uvicorn>=0.42.0",
]

49
scraper_api.py Normal file
View File

@ -0,0 +1,49 @@
import time
from fastapi import FastAPI
from pydantic import BaseModel
from seleniumbase import SB
# FastAPI application; this metadata is shown in the generated OpenAPI docs.
# Fixed the description string, which was broken English.
app = FastAPI(
    title="SeleniumBase Image Scraper API",
    description="An API that scrolls a page to the bottom and returns its HTML content.",
    version="1.0.0"
)
class ScrapeRequest(BaseModel):
    """Request body for the scrape endpoint."""

    url: str  # page URL to load; passed straight to sb.activate_cdp_mode()
@app.post("/")
def _get_image_urls(req: ScrapeRequest):
    """Scroll ``req.url`` to the bottom and return the final page HTML.

    Opens the page with undetected-chromedriver in CDP mode, scrolls down in
    two-screen-height increments until the document bottom is reached (or the
    safety limit of scroll steps is hit), then returns the page source so
    lazy-loaded content is included.
    """
    with SB(uc=True, headless=True) as sb:
        sb.activate_cdp_mode(req.url)
        sb.sleep(5)  # wait for initial load
        # Scroll increment: two screen heights per step.
        step_px = sb.execute_script("return window.screen.height") * 2
        max_scrolls = 500  # safety limit against endless lazy-load pages
        # Bounded for-loop instead of `while True` + manual counter; the cap
        # is enforced by range() rather than an explicit break condition.
        for step in range(1, max_scrolls + 1):
            target = step_px * step
            sb.execute_script(f"window.scrollTo(0, {target});")
            sb.sleep(0.5)  # let lazy-loaded content render between scrolls
            doc_height = sb.execute_script("return document.body.scrollHeight")
            if target > doc_height:  # reached (or passed) the bottom
                break
        # Final wait for any remaining lazy-loaded elements.
        sb.sleep(2)
        return {"pageSource": sb.get_page_source()}
if __name__ == "__main__":
    # Local/dev entrypoint only. NOTE(review): this runs on port 8001, while
    # the container entrypoint serves on 8000 — confirm the difference is
    # intended (e.g. to avoid clashing with a running container).
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8001)

1070
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff