2023-03-17 07:33:22 { "input": "Write a complete python script to maintain a mirror of a website as a git repository, crawling the domain like wget’s spider behavior; update every page each run:" }

```python
import os
import time
import requests
from bs4 import BeautifulSoup
from git import Repo
from urllib.parse import urlparse, urljoin

# Configuration
website_url = "https://example.com"
output_directory = "website_mirror"
git_repository = "website_mirror.git"
crawl_delay = 1  # in seconds

visited_urls = set()


def is_valid_url(url):
    # A URL is crawlable only if it has both a scheme and a network location.
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_links(base_url, content):
    # Extract every absolute link from an already-fetched page.
    soup = BeautifulSoup(content, "html.parser")
    urls = set()
    for link in soup.find_all("a"):
        href = link.get("href")
        if href and not href.startswith("#"):
            full_url = urljoin(base_url, href)
            if is_valid_url(full_url):
                urls.add(full_url)
    return urls


def save_page(url, content):
    # Mirror the URL path onto the filesystem; directory-style URLs get index.html.
    parsed_url = urlparse(url)
    path = os.path.join(output_directory, parsed_url.netloc, parsed_url.path.lstrip("/"))
    if not parsed_url.path or parsed_url.path.endswith("/"):
        path = os.path.join(path, "index.html")
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        f.write(content)


def crawl(start_url):
    # Spider the start URL's domain with an explicit queue; every page is fetched once per run.
    queue = [start_url]
    while queue:
        url = queue.pop()
        if url in visited_urls:
            continue
        print(f"Crawling: {url}")
        visited_urls.add(url)
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}")
            continue
        save_page(url, response.content)
        for link in get_all_links(url, response.content):
            # Stay on the same domain, like wget's spider behavior.
            if urlparse(link).netloc == urlparse(start_url).netloc:
                queue.append(link)
        time.sleep(crawl_delay)


def main():
    if not os.path.exists(git_repository):
        print(f"Initializing git repository: {git_repository}")
        Repo.init(git_repository)
    repo = Repo(git_repository)
    # Pages are saved under output_directory relative to the repository root.
    os.chdir(git_repository)

    print("Crawling website...")
    crawl(website_url)

    # Include untracked files so newly mirrored pages are committed on the first run.
    if repo.is_dirty(untracked_files=True):
        print("Committing changes to git repository...")
        repo.git.add(A=True)
        repo.git.commit(m="Update website mirror")
    else:
        print("No changes detected.")


if __name__ == "__main__":
    main()
```

32273 ms
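
A minimal scheduling sketch for the "update every page each run" requirement, under the assumption (not part of the logged answer) that the script above is saved as mirror.py. Launching each crawl as a separate subprocess keeps the module-level visited_urls set from persisting between runs, so every page is re-fetched and any changes are committed each cycle; a cron entry invoking the script would serve the same purpose.

```python
# Hypothetical runner, not part of the logged script: re-run the mirror on a fixed interval.
import subprocess
import sys
import time

UPDATE_INTERVAL = 6 * 60 * 60  # seconds between mirror updates (arbitrary choice)

while True:
    # Each run starts a fresh interpreter, so visited_urls is empty and all pages are re-crawled.
    subprocess.run([sys.executable, "mirror.py"], check=False)
    time.sleep(UPDATE_INTERVAL)
```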