Source code for fundamentals.files.fileChunker

#!/usr/local/bin/python
# encoding: utf-8
"""
*Iterate through large line-based files in batches of lines*

Author
: David Young
"""

from builtins import range
from builtins import object
import sys
import os

os.environ["TERM"] = "vt100"
from fundamentals import tools
import codecs


[docs] class fileChunker(object): """ *The fileChunker iterator - iterate over large line-based files to reduce memory footprint* **Key Arguments** - ``filepath`` -- path to the large file to iterate over - ``batchSize`` -- size of the chunks to return in lines **Usage** To setup your logger, settings and database connections, please use the ``fundamentals`` package (see tutorial here https://fundamentals.readthedocs.io/en/master/initialisation.html). To initiate a fileChunker iterator and then process the file in batches of 100000 lines, use the following: ```python from fundamentals.files import fileChunker fc = fileChunker( filepath="/path/to/large/file.csv", batchSize=100000 ) for i in fc: print len(i) ``` """ def __init__(self, filepath, batchSize): self.filepath = filepath self.batchSize = batchSize try: self.readFile = codecs.open(self.filepath, encoding="utf-8", mode="r") except IOError as e: message = "could not open the file %s" % (self.filepath,) raise IOError(message)
[docs] def __iter__(self): return self
[docs] def __next__(self): batch = [] for lines in range(self.batchSize): l = self.readFile.readline() if len(l): batch.append(l) if len(batch) == 0: raise StopIteration return batch