Large CSVs

If you have a very large CSV that contains some problems, here are some useful commands:

require "csv"

# Attempt 1: Ruby's standard CSV library. The block form of File.open
# closes the file handle even when parsing raises.
File.open("tiny.csv") { |file| CSV.parse(file) } # fails before returning

# Attempt 2: the smarter_csv gem, reading the file in chunks of 2 rows.
# gem install smarter_csv
require "smarter_csv" # the require path uses an underscore, like the gem name
SmarterCSV.process('tiny.csv', {:chunk_size => 2}) # never returns

# q runs SQL queries directly against CSV files: http://harelba.github.io/q/
# brew install harelba/q/q
# Extract just the npi column from dec5.csv into q_dec5.csv.
#   -d ,  use a comma as the field delimiter
#   -H    treat the first row as a header so columns can be selected by name
# `time` reports how long the query takes.
time q -d , -H "select npi from dec5.csv" > q_dec5.csv

# Count the lines diff prefixes with "<", i.e. lines from the first file
# that diff reports as removed or changed relative to the second file.
diff q_dec1.csv q_dec5.csv | grep "^<" | wc -l
# Same count on the sort_q_* files (presumably sorted copies of the q
# output; the command that produced them is not recorded here).
diff sort_q_dec1.csv sort_q_dec5.csv | grep "^<" | wc -l

Line count mismatches

# wc -l *.csv | sort
#       10 tiny.csv
#  1807551 q_dec1.csv
#  1807551 sort_q_dec1.csv
#  1807552 Order_and_Referring_Dec_1_2023.csv
#  1807552 dec1.csv
#  1807552 npis_dec1.csv
#  21698032 total
#  2531781 q_dec5.csv
#  2531781 sort_q_dec5.csv
#  2532234 Order_and_Referring_Dec_5_2023.csv
#  2532234 dec5.csv
#  2532234 npis_dec5.csv
#  2531782 python regular csv parse
#  2532374 python fancy regex parse (140 more NPIs than lines in the file)
#  2531679 python isdigit cells

import csv
import json
import re

# Load every row of the file with the standard csv module.
# newline="" lets the csv module do its own line-ending handling, which the
# csv documentation requires so that quoted fields containing embedded
# newlines are parsed correctly. The with-block closes the file handle as
# soon as the rows have been read.
with open("dec5.csv", newline="") as f:
    x = csv.reader(f)
    y = list(x)

# "Fancy regex parse": serialise all rows into one JSON string and extract
# every run of digits from it. \d+ matches any digit run in any cell, so the
# number of matches is not tied to the number of rows in the file.
# NOTE(review): json.dumps escapes non-ASCII characters as \uXXXX by default,
# so such escapes could contribute extra short digit runs -- confirm against
# the data before trusting the count.
z = re.findall(r"(\d+)", json.dumps(y))
z.sort()  # z holds strings, so this is a lexicographic sort, not numeric
len(z)
# >>> z[0:5]
# ['0', '0', '0', '0', '0']
# >>> z[2532370:2532374]
# ['95864', '9786117', '99', '992053854']

import collections
# Count how many of the extracted digit strings in z are exactly "0".
collections.Counter(z)["0"]
# 14
# First 15 entries of the sorted list: the fourteen "0" strings, followed by
# the next string in lexicographic order.
z[0:15]
# ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '003495276']

# Last 24 entries of the sorted list; many are far shorter than the longest
# values, i.e. the regex also picked up short digit fragments.
z[2532350:2532374]
# ['811973886', '813', '831202498', '841280740', '861037509', '864082', '86751238', '878', '88286520', '888', '9', '9', '9032315', '9152838', '916686', '932144730', '9405', '942517412', '94352641', '956505', '95864', '9786117', '99', '992053854']