Source code for fundamentals.files.fileChunker
#!/usr/local/bin/python
# encoding: utf-8
"""
*Iterate through large line-based files in batches of lines*
Author
: David Young
"""
from builtins import range
from builtins import object
import sys
import os
os.environ["TERM"] = "vt100"
from fundamentals import tools
import codecs
[docs]
class fileChunker(object):
    """
    *The fileChunker iterator - iterate over large line-based files to reduce memory footprint*

    **Key Arguments**

    - ``filepath`` -- path to the large file to iterate over
    - ``batchSize`` -- size of the chunks to return in lines

    **Usage**

    To setup your logger, settings and database connections, please use the ``fundamentals`` package (see tutorial here https://fundamentals.readthedocs.io/en/master/initialisation.html).

    To initiate a fileChunker iterator and then process the file in batches of 100000 lines, use the following:

    ```python
    from fundamentals.files import fileChunker
    fc = fileChunker(
        filepath="/path/to/large/file.csv",
        batchSize=100000
    )
    for i in fc:
        print(len(i))
    ```

    The underlying file handle is closed automatically once the iterator is
    exhausted. If you stop iterating early, call ``fc.close()`` or use the
    iterator as a context manager:

    ```python
    with fileChunker(filepath="/path/to/large/file.csv", batchSize=100000) as fc:
        firstBatch = next(fc)
    ```
    """

    def __init__(self, filepath, batchSize):
        """
        *Open ``filepath`` as UTF-8 text, ready to be read in batches of ``batchSize`` lines*

        Raises an ``IOError`` with the message "could not open the file <filepath>"
        if the file cannot be opened.
        """
        self.filepath = filepath
        self.batchSize = batchSize

        try:
            self.readFile = codecs.open(
                self.filepath, encoding="utf-8", mode="r")
        except IOError:
            message = "could not open the file %s" % (self.filepath,)
            raise IOError(message)

    def __iter__(self):
        """
        *Return the iterator itself (the object is its own iterator)*
        """
        return self

    def __next__(self):
        """
        *Return the next batch of lines from the file*

        **Return**

        - ``batch`` -- a list of up to ``batchSize`` lines, each line keeping its line ending

        Raises ``StopIteration`` once no further lines can be read; the file
        handle is closed at that point.
        """
        # ONCE EXHAUSTED (OR EXPLICITLY CLOSED) KEEP RAISING StopIteration
        # RATHER THAN ATTEMPTING TO READ FROM A CLOSED HANDLE
        if getattr(self.readFile, "closed", False):
            raise StopIteration

        batch = []
        for _ in range(self.batchSize):
            line = self.readFile.readline()
            if not line:
                # AN EMPTY STRING MEANS END-OF-FILE - NO POINT CALLING
                # readline FOR THE REMAINDER OF THE BATCH
                break
            batch.append(line)

        if not batch:
            # NOTHING LEFT TO READ - RELEASE THE FILE HANDLE
            self.close()
            raise StopIteration

        return batch

    def close(self):
        """
        *Close the underlying file handle (safe to call more than once)*
        """
        if not getattr(self.readFile, "closed", False):
            self.readFile.close()

    def __enter__(self):
        """
        *Enter a ``with`` block, returning the iterator*
        """
        return self

    def __exit__(self, excType, excValue, traceback):
        """
        *Leave a ``with`` block, closing the file; exceptions are not suppressed*
        """
        self.close()
        return False