chore: initial commit
This commit is contained in:
commit
c26c578e01
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
latest_logs/*
|
||||
|
||||
.idea
|
||||
|
||||
downloaded_files/*
|
||||
48
Dockerfile
Normal file
48
Dockerfile
Normal file
@ -0,0 +1,48 @@
|
||||
FROM ubuntu:24.04

# Avoid interactive prompts during package installs; point X clients at the
# virtual display started by entrypoint.sh.
ENV DEBIAN_FRONTEND=noninteractive
ENV DISPLAY=:99

# System deps: Xvfb plus the shared libraries and fonts Chrome needs.
RUN apt-get update && apt-get install -y \
    wget curl unzip xvfb \
    libgtk-3-0 libnss3 libatk-bridge2.0-0 libcups2 \
    libxcomposite1 libxrandr2 libgbm1 libasound2t64 \
    libxdamage1 libxfixes3 libxkbcommon0 libpango-1.0-0 libcairo2 \
    fonts-liberation fonts-noto-color-emoji \
    && rm -rf /var/lib/apt/lists/*

# Install Chrome from Google's official .deb (apt resolves its dependencies).
# FIX: clean the apt lists in this layer too — they were previously left
# behind here, bloating the image.
RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
    && apt-get update \
    && apt-get install -y ./google-chrome-stable_current_amd64.deb \
    && rm google-chrome-stable_current_amd64.deb \
    && rm -rf /var/lib/apt/lists/*

# Install Python (FIX: clean apt lists in this layer as well).
RUN apt-get update && apt-get install -y python3 python3-venv python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Create virtualenv
RUN python3 -m venv /venv

# Put the venv first on PATH so "pip"/"python" resolve inside it by default.
ENV PATH="/venv/bin:$PATH"

# Upgrade pip inside the venv, then install Python deps in one layer.
# --no-cache-dir keeps pip's download cache out of the image.
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir seleniumbase pika fastapi uvicorn bs4

# Pre-download the matching chromedriver so startup doesn't need the network.
RUN seleniumbase get chromedriver

# App
WORKDIR /app
COPY scraper_api.py /app/scraper_api.py

# Entrypoint
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

EXPOSE 8000

ENTRYPOINT ["/entrypoint.sh"]
|
||||
7
entrypoint.sh
Normal file
7
entrypoint.sh
Normal file
@ -0,0 +1,7 @@
|
||||
#!/bin/bash
# Container entrypoint: boot a virtual X display, then run the scraper API.
# FIX: fail fast on errors/unset vars, and give Xvfb a moment to create the
# display before any X client (Chrome) tries to connect to it.
set -euo pipefail

# Start virtual display (matches ENV DISPLAY=:99 in the Dockerfile)
Xvfb :99 -screen 0 1920x1080x24 -nolisten tcp &

# Brief grace period so :99 exists before browser sessions start.
sleep 1

# Start API/Worker — exec so uvicorn becomes PID 1 and receives signals.
exec uvicorn scraper_api:app --host 0.0.0.0 --port 8000
|
||||
10
pyproject.toml
Normal file
10
pyproject.toml
Normal file
@ -0,0 +1,10 @@
|
||||
[project]
name = "sb-taimu"
version = "0.1.0"
description = "SeleniumBase-powered API that scrolls pages and returns their rendered HTML"
# FIX: the Docker image is Ubuntu 24.04, whose system Python is 3.12, so
# requiring >=3.14 made the project uninstallable in its own container.
requires-python = ">=3.12"
dependencies = [
    "fastapi>=0.135.2",
    "seleniumbase>=4.47.8",
    "uvicorn>=0.42.0",
]
|
||||
49
scraper_api.py
Normal file
49
scraper_api.py
Normal file
@ -0,0 +1,49 @@
|
||||
import time
|
||||
from fastapi import FastAPI
|
||||
from pydantic import BaseModel
|
||||
from seleniumbase import SB
|
||||
|
||||
# FastAPI application object; title/description/version surface in the
# auto-generated OpenAPI docs at /docs.
# FIX: the description string was garbled ("scrolls pages get is html content").
app = FastAPI(
    title="SeleniumBase Image Scraper API",
    description="An API that scrolls a page and returns its HTML content",
    version="1.0.0",
)
|
||||
|
||||
class ScrapeRequest(BaseModel):
    """Request body for the scrape endpoint (POST "/")."""

    # Target page URL to open, scroll through, and capture HTML from.
    url: str
|
||||
|
||||
@app.post("/")
def _get_image_urls(req: ScrapeRequest):
    """Open ``req.url``, scroll to the bottom of the page (triggering any
    infinite-scroll / lazy loading), and return the fully rendered HTML.

    Returns:
        dict: ``{"pageSource": <html source string>}``.
    """
    # Safety limit so pages that grow forever can't hang the request.
    max_scrolls = 500

    with SB(uc=True, headless=True) as sb:
        sb.activate_cdp_mode(req.url)
        sb.sleep(5)  # Wait for initial load

        # Scroll two screen-heights per step: big jumps keep the number of
        # round-trips down while still triggering lazy loaders.
        step = sb.execute_script("return window.screen.height") * 2

        # FIX: bounded for-loop instead of `while True` with a hand-rolled
        # counter and break guard — same iteration count (up to 500 scrolls),
        # clearer control flow.
        for i in range(1, max_scrolls + 1):
            current_scroll = step * i
            sb.execute_script(f"window.scrollTo(0, {current_scroll});")

            time.sleep(0.5)  # Let newly revealed content start loading.

            # Re-read the height each pass: infinite-scroll pages grow as we go.
            scroll_height = sb.execute_script("return document.body.scrollHeight")

            # Stop once we've scrolled past the bottom of the page.
            if current_scroll > scroll_height:
                break

        # Final wait for lazy-loaded elements
        sb.sleep(2)

        # Extract and return the rendered HTML.
        return {"pageSource": sb.get_page_source()}
|
||||
|
||||
if __name__ == "__main__":
    # Local dev entrypoint on port 8001; the container instead serves via
    # entrypoint.sh (uvicorn on port 8000).
    import uvicorn

    uvicorn.run(app, port=8001, host="0.0.0.0")
|
||||
Loading…
x
Reference in New Issue
Block a user