# Source record metadata: logged 2023-03-17 07:21:39; the trailing "33486 ms"
# figure in the record is presumably the generation latency.
"""Update a local git repository and chunk its text files into JSON Lines.

Original request:
    "Write a complete python script to update a local git repository from
    its remote, and then chunk all text files into content if user-specified
    length with user-specified overlap, and output the chunks in jsonlines
    format with full metadata:"

This script takes four command-line arguments: the path to the local git
repository, the desired chunk size, the desired overlap between chunks, and
the output file in jsonlines format. It updates the local git repository
from its remote, chunks all text files (files whose name ends with ``.txt``)
in the repository into the specified size with the specified overlap, and
outputs the chunks in jsonlines format with full metadata.

Sizes are measured in characters. The overlap must satisfy
``0 <= overlap < chunk_size``; otherwise the chunking window could never
advance.
"""

import argparse
import json
import os
import sys

try:
    from git import Repo
except ImportError:
    # GitPython is only needed by update_repo(); keep the chunking helpers
    # importable without it and report the problem when a pull is requested.
    Repo = None


def _validate_chunking(chunk_size, overlap):
    """Raise ValueError unless the window can advance through the text."""
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer')
    if not 0 <= overlap < chunk_size:
        raise ValueError('overlap must satisfy 0 <= overlap < chunk_size')


def update_repo(repo_path):
    """Pull the latest changes from the ``origin`` remote of a repository.

    Args:
        repo_path: Path to the local git working tree.

    Raises:
        ImportError: If GitPython is not installed.
    """
    if Repo is None:
        raise ImportError(
            'GitPython is required to update the repository '
            "(install it with 'pip install GitPython')"
        )
    repo = Repo(repo_path)
    repo.remotes.origin.pull()


def chunk_file(file_path, chunk_size, overlap):
    """Split a UTF-8 text file into overlapping chunks of characters.

    Args:
        file_path: Path of the file to read.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        overlap: Number of characters shared by consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        A list of strings. Every chunk except possibly the last has exactly
        ``chunk_size`` characters. An empty file yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    _validate_chunking(chunk_size, overlap)

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    step = chunk_size - overlap  # > 0 thanks to the validation above
    total = len(content)
    chunks = []
    start = 0
    while start < total:
        chunks.append(content[start:start + chunk_size])
        # Stop as soon as a chunk reaches the end of the text; a further
        # window would only repeat characters that were already emitted.
        if start + chunk_size >= total:
            break
        start += step
    return chunks


def process_files(repo_path, chunk_size, overlap):
    """Chunk every ``.txt`` file under a directory tree.

    The ``.git`` directory is not traversed, and directories and files are
    visited in sorted order so repeated runs produce the same record order.
    Files that cannot be read or decoded as UTF-8 are skipped with a warning
    on stderr instead of aborting the whole run.

    Args:
        repo_path: Root directory to walk.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        A list of dicts with the keys ``file_path``, ``chunk_index``,
        ``chunk_size``, ``overlap`` and ``content``. ``chunk_size`` and
        ``overlap`` are the requested settings, not measured lengths.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    # Validate once up front so bad settings fail even for an empty tree.
    _validate_chunking(chunk_size, overlap)

    output = []
    for root, dirs, files in os.walk(repo_path):
        # In-place assignment tells os.walk which directories to descend into.
        dirs[:] = sorted(name for name in dirs if name != '.git')
        for file_name in sorted(files):
            if not file_name.endswith('.txt'):
                continue
            file_path = os.path.join(root, file_name)
            try:
                chunks = chunk_file(file_path, chunk_size, overlap)
            except (OSError, UnicodeDecodeError) as exc:
                print(
                    'Skipping {}: {}'.format(file_path, exc),
                    file=sys.stderr,
                )
                continue
            output.extend(
                {
                    'file_path': file_path,
                    'chunk_index': index,
                    'chunk_size': chunk_size,
                    'overlap': overlap,
                    'content': chunk,
                }
                for index, chunk in enumerate(chunks)
            )
    return output


def write_jsonlines(output, output_file):
    """Write records to a file as JSON Lines (one JSON object per line).

    Args:
        output: Iterable of JSON-serialisable records.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(
            json.dumps(item, ensure_ascii=False) + '\n' for item in output
        )


def main():
    """Parse the command line, pull the repository and write the chunks."""
    parser = argparse.ArgumentParser(
        description='Update a local git repository and chunk text files.'
    )
    parser.add_argument('repo_path', help='Path to the local git repository')
    parser.add_argument('chunk_size', type=int, help='Size of the chunks')
    parser.add_argument('overlap', type=int, help='Overlap between chunks')
    parser.add_argument('output_file', help='Output file in jsonlines format')
    args = parser.parse_args()

    # Reject unusable settings before touching the network or the disk.
    try:
        _validate_chunking(args.chunk_size, args.overlap)
    except ValueError as exc:
        parser.error(str(exc))

    update_repo(args.repo_path)
    output = process_files(args.repo_path, args.chunk_size, args.overlap)
    write_jsonlines(output, args.output_file)


if __name__ == '__main__':
    main()