#!/usr/bin/env bash
# pos2sql.sh - given a TSV (.pos) file of tokens, lemmas, and parts-of-speech, write a file of SQL INSERT statements to ./tmp/sql-pos
# usage: mkdir -p ./tmp/sql-pos; find ./pos -name "*.pos" | sort | parallel --will-cite /export/reader/bin/pos2sql.sh
# Eric Lease Morgan <[log in to unmask]>
# (c) University of Notre Dame and distributed under a GNU Public License
# June 21, 2020 - based on other work
# configure
# SQLPOS   - directory receiving one <basename>.sql file per input file
# TEMPLATE - printf format for a single row; the six %s slots are filled, in
#            order, with the document id, sid, tid, token, lemma, and pos
# QUOTE    - a literal single quote, kept in a variable so it can be used
#            safely inside ${var//pattern/replacement} expansions
SQLPOS='./tmp/sql-pos'
TEMPLATE="INSERT INTO pos ( 'id', 'sid', 'tid', 'token', 'lemma', 'pos' ) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s' );"
QUOTE="'"

# sanity check; a missing argument is an error, so exit non-zero
if [[ -z ${1:-} ]]; then
  echo "Usage: $0 <file.pos>" >&2
  exit 1
fi

# initialize
TSV=$1
BASENAME=$( basename -- "$TSV" .pos )
OUTPUT="$SQLPOS/$BASENAME.sql"

# debug
printf '%s\n' "$BASENAME" >&2

# if the desired output already exists, then don't do it again
if [[ -f "$OUTPUT" ]]; then exit; fi

# refuse to continue when the input is unreadable; otherwise an (almost) empty
# .sql file would be written and every later run would skip this input
if [[ ! -r "$TSV" ]]; then
  printf 'Error: cannot read %s\n' "$TSV" >&2
  exit 1
fi

# extract document_id: the second dash-delimited field of the basename, with
# leading zeros removed; I wish they had given me a key
DOCUMENTID=$( printf '%s\n' "$BASENAME" | cut -d'-' -f2 | sed 's/^0*//' )

# process each line in the file, sans the header
SQL=''
{
  # discard the header line
  IFS= read -r _

  # read whole lines first so a final line lacking a newline is still processed
  while IFS= read -r LINE || [[ -n "$LINE" ]]; do

    # split on tabs; -r keeps backslashes in the data literal. The first
    # column is read but ignored, because the id written to SQL is DOCUMENTID.
    # NOTE(review): tab is IFS whitespace, so runs of tabs (empty fields)
    # collapse and later columns shift left - confirm inputs never contain
    # empty fields
    IFS=$'\t' read -r _ SID TID TOKEN LEMMA POS <<< "$LINE"

    # escape: double every single quote so each value is a valid SQL literal
    SID=${SID//$QUOTE/$QUOTE$QUOTE}
    TID=${TID//$QUOTE/$QUOTE$QUOTE}
    TOKEN=${TOKEN//$QUOTE/$QUOTE$QUOTE}
    LEMMA=${LEMMA//$QUOTE/$QUOTE$QUOTE}
    POS=${POS//$QUOTE/$QUOTE$QUOTE}

    # create an INSERT statement and then update the SQL; the values are passed
    # as printf arguments (never as format text or sed replacement text), so
    # characters such as &, |, \, %, and * are stored verbatim
    # shellcheck disable=SC2059 # TEMPLATE is a constant format defined above
    printf -v INSERT "$TEMPLATE" "$DOCUMENTID" "$SID" "$TID" "$TOKEN" "$LEMMA" "$POS"
    SQL+="$INSERT"$'\n'

  done
} < "$TSV"

# output; the file is written in one go, after all rows have been built, and
# ends with one blank line
mkdir -p -- "$SQLPOS" || exit 1
printf '%s\n' "$SQL" > "$OUTPUT" || exit 1

# done
exit