Min løsning var manuelt at åbne BSON-filerne (med Python), finde de store dokumenter og fjerne en del af deres indhold, derefter skrive BSON-objekterne til en ny BSON-fil og indlæse den redigerede BSON-fil, som så blev gemt i MongoDB.
Dette opfylder dog ikke mit ønske om at kunne indlæse den dumpede database i systemet uden at ændre den!
Python3:
import bson
from pprint import pprint
def get_bson_data(filename):
    """Read *filename* in binary mode and decode its whole content as BSON.

    Args:
        filename: Path of the BSON file to read.

    Returns:
        Whatever ``bson.decode_all`` produces for the file's bytes.
    """
    with open(filename, "rb") as bson_file:
        raw_bytes = bson_file.read()
    return bson.decode_all(raw_bytes)
def report_problematics_documents(data):
    """Print each item of *data* flagged by ``is_too_big``, then a summary.

    Every flagged item is printed and the function then waits on ``input``
    before moving on, so this is meant for interactive runs. After the loop
    it prints the total number of items, the flagged items themselves and
    how many were flagged.

    Args:
        data: Sized iterable of items to check.

    Returns:
        None. All results are written to stdout.
    """
    problematics = []
    for item in data:
        if not is_too_big(item):
            continue
        # The print and the prompt are separate statements: show the item
        # first, then wait once for the operator before continuing.
        print(item)
        input("give me some more...")
        problematics.append(item)
    print(f"data len: {len(data)}")
    print(f"problematics: {problematics}")
    print(f"problematics len: {len(problematics)}")
def shrink_data(data):
    """Replace, in place, every entry of *data* flagged by ``is_too_big``.

    A flagged entry is overwritten with the result of ``shrink_item`` and
    its index is printed. Entries that are not flagged are left untouched.

    Args:
        data: Mutable, index-assignable sequence of items.

    Returns:
        None. *data* itself is modified.
    """
    for i, document in enumerate(data):
        if not is_too_big(document):
            continue
        # or delete it...
        data[i] = shrink_item(document)
        print(f"item shrinked: {i}")
def write_bson_file(data, filename):
    """Encode each entry of *data* with ``bson.BSON.encode`` and write it out.

    The encoded entries are written one after another, in iteration order,
    to *filename*, which is opened in binary write mode.

    Args:
        data: Iterable of items accepted by ``bson.BSON.encode``.
        filename: Path of the file to write.

    Returns:
        None.
    """
    with open(filename, "wb") as out:
        # The generator encodes lazily, so each entry is encoded right
        # before it is written, exactly as in a per-item loop.
        out.writelines(bson.BSON.encode(document) for document in data)
def is_too_big(item):
    """Placeholder predicate deciding whether *item* counts as too big.

    Not implemented yet: it ignores *item* and returns None, which callers
    testing the result for truth treat as "not too big".
    """
    # you need to implement this one...
    return None
def shrink_item(item):
    """Placeholder that should return a reduced version of *item*.

    Not implemented yet: it ignores *item* and returns None.
    """
    # you need to implement this one...
    return None
def main():
    """Run the load / report / shrink / write / reload round trip.

    Steps, in order:
      1. Load the documents from the hard-coded input path.
      2. Report flagged documents, shrink them, and report again.
      3. Write the documents to ``<input path>.new``.
      4. Reload that new file and report once more.
    """
    source_path = "/path/to/file.bson"
    target_path = source_path + ".new"

    documents = get_bson_data(source_path)
    report_problematics_documents(documents)
    shrink_data(documents)
    report_problematics_documents(documents)

    write_bson_file(documents, target_path)

    print("Load new data")
    reloaded = get_bson_data(target_path)
    report_problematics_documents(reloaded)
# Script entry point: run main() only when this file is executed directly.
if __name__ == "__main__":
    main()