[ot][spam]gpt-4 completions

Undescribed Horrific Abuse, One Victim & Survivor of Many gmkarl at gmail.com
Fri Mar 17 04:35:03 PDT 2023


2023-03-17 07:33:22
{
  "input": "Write a complete python script to maintain a mirror of a
website as a git repository, crawling the domain like wget’s spider
behavior; update every page each run:"
}

```python
import os
import time
import requests
from bs4 import BeautifulSoup
from git import Repo
from urllib.parse import urlparse, urljoin

# Configuration
website_url = "https://example.com"
output_directory = "website_mirror"  # created inside the repository's working tree
git_repository = "website_mirror.git"
crawl_delay = 1  # seconds between requests

visited_urls = set()


def is_valid_url(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_links(url, content):
    """Extract absolute links from already-fetched page content."""
    soup = BeautifulSoup(content, "html.parser")
    urls = set()
    for link in soup.find_all("a"):
        href = link.get("href")
        if href and not href.startswith("#"):
            full_url = urljoin(url, href)
            if is_valid_url(full_url):
                urls.add(full_url)
    return urls


def save_page(url, content):
    parsed_url = urlparse(url)
    # Directory-style URLs (empty path or trailing "/") get an index.html
    # filename so there is always a regular file to write.
    relative_path = parsed_url.path.lstrip("/")
    if not relative_path or relative_path.endswith("/"):
        relative_path += "index.html"
    path = os.path.join(output_directory, parsed_url.netloc, relative_path)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        f.write(content)


def crawl(url):
    if url in visited_urls:
        return

    print(f"Crawling: {url}")
    visited_urls.add(url)

    try:
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return

    save_page(url, response.content)
    time.sleep(crawl_delay)

    # Follow only links that stay on the same domain. Very deep sites can
    # exhaust Python's recursion limit with this recursive approach.
    for link in get_all_links(url, response.content):
        if urlparse(link).netloc == urlparse(website_url).netloc:
            crawl(link)


def main():
    if not os.path.exists(git_repository):
        print(f"Initializing git repository: {git_repository}")
        Repo.init(git_repository)  # non-bare repository with a working tree

    repo = Repo(git_repository)
    # Work inside the repository so the mirrored pages end up in its working tree.
    os.chdir(git_repository)

    print("Crawling website...")
    crawl(website_url)

    # untracked_files=True so newly downloaded pages also trigger a commit.
    if repo.is_dirty(untracked_files=True):
        print("Committing changes to git repository...")
        repo.git.add(A=True)
        repo.git.commit(m="Update website mirror")
    else:
        print("No changes detected.")


if __name__ == "__main__":
    main()

```

32273 ms
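
The script depends on requests, beautifulsoup4, and GitPython, and the website_url, output_directory, and git_repository values at the top are placeholders to adjust. One caveat: crawl() is recursive, so a site with long chains of internal links can hit Python's default recursion limit (roughly 1000 frames). Below is a minimal sketch of an iterative, queue-based variant; it is not part of the completion above and assumes the get_all_links() and save_page() helpers defined in the script.

```python
from collections import deque
from urllib.parse import urlparse
import time

import requests


def crawl_iteratively(start_url, crawl_delay=1):
    """Breadth-first crawl of start_url's domain without recursion."""
    domain = urlparse(start_url).netloc
    queue = deque([start_url])
    visited = set()

    while queue:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)

        try:
            response = requests.get(url, timeout=30)
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}")
            continue

        # Reuses save_page() and get_all_links() from the script above.
        save_page(url, response.content)
        time.sleep(crawl_delay)

        for link in get_all_links(url, response.content):
            if urlparse(link).netloc == domain and link not in visited:
                queue.append(link)
```

Dropping this in place of crawl() leaves the rest of the script, including the git initialization and commit logic, unchanged.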

