chore: initial commit
This commit is contained in:
commit
c26c578e01
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
# Runtime output directory (presumably SeleniumBase logs) - contents ignored.
latest_logs/*

# JetBrains IDE project settings.
.idea

# Files fetched while the browser runs - contents ignored.
downloaded_files/*
|
||||||
48
Dockerfile
Normal file
48
Dockerfile
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
# Scraper image: Ubuntu base with Google Chrome, Xvfb and a Python virtualenv
# that runs the FastAPI app from scraper_api.py through entrypoint.sh.
FROM ubuntu:24.04

# Keep apt from prompting during the build.
ENV DEBIAN_FRONTEND=noninteractive
# X display the browser connects to; entrypoint.sh starts Xvfb on it.
ENV DISPLAY=:99

# System deps: download tools, the virtual framebuffer, and the GUI/audio
# libraries and fonts installed alongside the browser.
RUN apt-get update && apt-get install -y \
    wget curl unzip xvfb \
    libgtk-3-0 libnss3 libatk-bridge2.0-0 libcups2 \
    libxcomposite1 libxrandr2 libgbm1 libasound2t64 \
    libxdamage1 libxfixes3 libxkbcommon0 libpango-1.0-0 libcairo2 \
    fonts-liberation fonts-noto-color-emoji \
    && rm -rf /var/lib/apt/lists/*

# Install Chrome from the downloaded .deb so apt resolves its dependencies.
# The package file and the apt lists are removed in the same layer so neither
# ends up in the image.
RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
    && apt-get update \
    && apt-get install -y ./google-chrome-stable_current_amd64.deb \
    && rm google-chrome-stable_current_amd64.deb \
    && rm -rf /var/lib/apt/lists/*

# Install Python (apt lists cleaned in the same layer, as above).
# NOTE(review): pyproject.toml declares requires-python >=3.14 while this uses
# the distro python3 - confirm which one is intended.
RUN apt-get update \
    && apt-get install -y python3 python3-venv python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Create virtualenv
RUN python3 -m venv /venv

# Activate venv by default: its bin/ directory shadows the system python/pip.
ENV PATH="/venv/bin:$PATH"

# Upgrade pip inside venv (--no-cache-dir keeps pip's cache out of the image).
RUN pip install --no-cache-dir --upgrade pip

# Install Python deps
RUN pip install --no-cache-dir seleniumbase pika fastapi uvicorn bs4

# Install drivers
RUN seleniumbase get chromedriver

# App
WORKDIR /app
COPY scraper_api.py /app/scraper_api.py

# Entrypoint
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

# Port uvicorn listens on (see entrypoint.sh).
EXPOSE 8000

ENTRYPOINT ["/entrypoint.sh"]
|
||||||
7
entrypoint.sh
Normal file
7
entrypoint.sh
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
#!/bin/bash
# Container entrypoint: start a virtual X display, then run the API server.

# Abort on the first failing command or on use of an unset variable.
set -eu

# Start Xvfb on the display the browser will use. DISPLAY is set by the image
# (ENV DISPLAY in the Dockerfile); fall back to :99 when it is unset so the
# script keeps working outside the image.
export DISPLAY="${DISPLAY:-:99}"

# Start virtual display (in the background; it lives as long as the container)
Xvfb "$DISPLAY" -screen 0 1920x1080x24 -nolisten tcp &

# Start API/Worker. exec replaces this shell, so uvicorn receives the
# container's stop signals directly.
exec uvicorn scraper_api:app --host 0.0.0.0 --port 8000
|
||||||
10
pyproject.toml
Normal file
10
pyproject.toml
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
[project]
name = "sb-taimu"
version = "0.1.0"
description = "FastAPI service that scrolls a web page with SeleniumBase and returns its HTML"
# NOTE(review): the Dockerfile runs the app on the distro python3 from
# ubuntu:24.04 - confirm that interpreter satisfies this constraint.
requires-python = ">=3.14"
dependencies = [
    "fastapi>=0.135.2",
    # scraper_api.py imports pydantic directly, so declare it explicitly
    # instead of relying on it arriving as a transitive dependency.
    "pydantic",
    "seleniumbase>=4.47.8",
    "uvicorn>=0.42.0",
]
|
||||||
49
scraper_api.py
Normal file
49
scraper_api.py
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
import time
|
||||||
|
from fastapi import FastAPI
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from seleniumbase import SB
|
||||||
|
|
||||||
|
# ASGI application object; entrypoint.sh serves it as "scraper_api:app".
app = FastAPI(
    title="SeleniumBase Image Scraper API",
    description="An API that scrolls pages and gets their HTML content",
    version="1.0.0",
)
|
||||||
|
|
||||||
|
# Request body accepted by the POST "/" endpoint.
class ScrapeRequest(BaseModel):
    # Address of the page to open in the browser.
    url: str
|
||||||
|
|
||||||
|
@app.post("/")
def _get_image_urls(req: ScrapeRequest):
    """Open the requested page, scroll to the bottom and return its HTML.

    The page is loaded in a headless SeleniumBase browser (UC + CDP mode) and
    scrolled two screen heights at a time until the scroll position passes
    the document height or the scroll limit is reached, so that content
    loaded on scroll is present in the returned source.

    Args:
        req: Request body carrying the ``url`` to load.

    Returns:
        A dict with a single key, ``"pageSource"``, holding the page HTML.

    Raises:
        HTTPException: 400 when ``url`` names an explicit scheme other than
            http or https.
    """
    from urllib.parse import urlsplit

    from fastapi import HTTPException

    # The URL comes straight from the request body and is handed to a real
    # browser, so refuse explicit non-web schemes (file:, chrome:, data:, ...)
    # that could expose local resources. Scheme-less input is passed through
    # unchanged, exactly as before.
    scheme = urlsplit(req.url.strip()).scheme.lower()
    if scheme and scheme not in ("http", "https"):
        raise HTTPException(
            status_code=400,
            detail="Only http and https URLs are supported",
        )

    max_scrolls = 500  # Safety limit to prevent infinite loops
    scroll_pause = 0.5  # Seconds to wait after each scroll step
    # Used when the browser reports no screen height; matches the Xvfb screen
    # height configured in entrypoint.sh.
    fallback_screen_height = 1080

    with SB(uc=True, headless=True) as sb:
        sb.activate_cdp_mode(req.url)
        sb.sleep(5)  # Wait for initial load

        reported_height = sb.execute_script("return window.screen.height")
        scroll_step = (reported_height or fallback_screen_height) * 2

        # Scroll down one step at a time, at most max_scrolls times.
        for step_number in range(1, max_scrolls + 1):
            current_scroll = scroll_step * step_number
            sb.execute_script(f"window.scrollTo(0, {current_scroll});")

            time.sleep(scroll_pause)

            # Re-read the height on every step: it grows while content loads.
            # A missing value is treated as 0, which ends the loop instead of
            # failing the comparison below.
            scroll_height = (
                sb.execute_script("return document.body.scrollHeight") or 0
            )

            # Stop once the scroll position is past the end of the document.
            if current_scroll > scroll_height:
                break

        # Final wait for lazy-loaded elements
        sb.sleep(2)

        # Extract HTML
        page_source = sb.get_page_source()

    return {"pageSource": page_source}
|
||||||
|
|
||||||
|
# Direct-run entry point (python scraper_api.py).
# NOTE(review): this listens on 8001, while entrypoint.sh starts uvicorn on
# 8000 inside the container - confirm the difference is intentional.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8001)
|
||||||
Loading…
x
Reference in New Issue
Block a user