[ot][spam]gpt-4 completions

Undescribed Horrific Abuse, One Victim & Survivor of Many gmkarl at gmail.com
Fri Mar 17 04:23:53 PDT 2023


2023-03-17 07:21:39	{
  "input": "Write a complete python script to update a local git
repository from its remote, and then chunk all text files into content
of user-specified length with user-specified overlap, and output the
chunks in jsonlines format with full metadata:"
}	```python
import os
import sys
import json
import argparse
from git import Repo

def update_repo(repo_path):
    """Pull the ``origin`` remote into the repository at ``repo_path``.

    Args:
        repo_path: Path handed to ``Repo`` to open the local repository.
    """
    Repo(repo_path).remotes.origin.pull()

def chunk_file(file_path, chunk_size, overlap):
    """Split a UTF-8 text file into overlapping fixed-size character chunks.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of strings. Every chunk except possibly the last holds exactly
        ``chunk_size`` characters, and each chunk starts
        ``chunk_size - overlap`` characters after the previous one. An empty
        file yields a single empty chunk.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    # A non-positive stride would keep the window in place (or move it
    # backwards) and loop forever; a negative overlap would skip text.
    if chunk_size <= 0:
        raise ValueError(f'chunk_size must be positive, got {chunk_size}')
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f'overlap must be in [0, chunk_size), got {overlap} '
            f'for chunk_size {chunk_size}'
        )

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    stride = chunk_size - overlap
    chunks = []
    start = 0
    # Emit full-size windows while one still ends strictly inside the text.
    while start + chunk_size < len(content):
        chunks.append(content[start:start + chunk_size])
        start += stride
    # The remainder (at most chunk_size characters) becomes the final chunk.
    chunks.append(content[start:])
    return chunks

def process_files(repo_path, chunk_size, overlap):
    """Chunk every ``.txt`` file under ``repo_path`` into metadata records.

    Args:
        repo_path: Directory tree to walk.
        chunk_size: Chunk length forwarded to ``chunk_file``.
        overlap: Chunk overlap forwarded to ``chunk_file``.

    Returns:
        A list of dicts, one per chunk, with the keys ``file_path``,
        ``chunk_index``, ``chunk_size``, ``overlap`` and ``content``.
    """
    records = []
    for dirpath, _subdirs, filenames in os.walk(repo_path):
        for name in filenames:
            # Only files whose name ends in '.txt' are chunked.
            if not name.endswith('.txt'):
                continue
            txt_path = os.path.join(dirpath, name)
            pieces = chunk_file(txt_path, chunk_size, overlap)
            records.extend(
                {
                    'file_path': txt_path,
                    'chunk_index': index,
                    'chunk_size': chunk_size,
                    'overlap': overlap,
                    'content': text,
                }
                for index, text in enumerate(pieces)
            )
    return records

def write_jsonlines(output, output_file):
    """Write each item of ``output`` as one JSON document per line.

    Args:
        output: Iterable of JSON-serialisable items.
        output_file: Path of the file to create or overwrite; it is written
            as UTF-8 and non-ASCII characters are kept unescaped.
    """
    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in output
        )

def main():
    """Parse the command line, update the repository and write the chunks.

    Positional arguments: ``repo_path``, ``chunk_size`` (int), ``overlap``
    (int) and ``output_file``. Exits through ``parser.error`` when
    ``chunk_size`` is not positive or ``overlap`` is not in
    ``[0, chunk_size)``.
    """
    # The description must be a single unbroken string literal; a literal
    # wrapped across two physical lines is a syntax error.
    parser = argparse.ArgumentParser(
        description='Update a local git repository and chunk text files.')
    parser.add_argument('repo_path', help='Path to the local git repository')
    parser.add_argument('chunk_size', type=int, help='Size of the chunks')
    parser.add_argument('overlap', type=int, help='Overlap between chunks')
    parser.add_argument('output_file', help='Output file in jsonlines format')
    args = parser.parse_args()

    # Reject values for which the chunk window could not advance (or would
    # skip text) before doing any network or file work.
    if args.chunk_size <= 0:
        parser.error('chunk_size must be a positive integer')
    if not 0 <= args.overlap < args.chunk_size:
        parser.error('overlap must be non-negative and smaller than chunk_size')

    update_repo(args.repo_path)
    output = process_files(args.repo_path, args.chunk_size, args.overlap)
    write_jsonlines(output, args.output_file)

# Run the command-line entry point only when executed as a script, not on import.
if __name__ == '__main__':
    main()
```

This script takes four command-line arguments: the path to the local
git repository, the desired chunk size, the desired overlap between
chunks, and the output file in jsonlines format. It updates the local
git repository from its remote, chunks every text file in the
repository (files whose names end in `.txt`) into pieces of the
specified size with the specified overlap, and outputs the chunks in
jsonlines format with full metadata.
33486 ms


More information about the cypherpunks mailing list