#! /usr/bin/perl -w if (defined($ARGV[0])) { $infile = $ARGV[0]; } $infile =~ m/(.*)/; $itemid = $1; $INFILE = "/caartemp/byu/xml/$itemid.xml"; $OUT_FILE = "/caartemp/byu/bstusgm/$itemid.sgm"; open OUT_FILE, ">$OUT_FILE" or die "can't open OUT_FILE"; open INFILE or die "can't open $INFILE"; $teiskip = 1; LINE:while ($line=) { $line=~s#\r##g; $line=~s###g; $line=~s#

#

#g; $line=~s###g; $line=~s###g; $line=~s###g; $line=~s###g; $line=~s###g; $line=~s#.*##g; $line=~s#/>#>#g; # deleted a stray character that had no value in ascii appeared in Huntington9.sgm $line=~s#\x0C##g; # change stray characters to " in xml like byu did $line=~s#\xe2\x80\x9e#"#g; # change characters to double quotes "" in xml like byu did for Huntington9.xml $line=~s#\xE2\x80\x9C#""#g; $line=~s#\xE2\x80\x9D#""#g; # change characters to ' in xml like byu did for Huntington9.xml $line=~s#\xE2\x80\x99#’#g; # change endash to -- for a subject that appears in an orig tag in file smithj.xml $line=~s#\xE2\x80\x93#--#g; # change hex for dash to - in the file Huntington9.xml $line=~s#\xE2\x80\x94#-#g; # change unknown characters to ? like byu did in their text found first in Harris.sgm $line=~s#\xE2\x82\xA4#?#g; $line=~s#\xEF\x9E\x8D#?#g; $line=~s#\x88\xA3\x35#?#g; $line=~s#\xE2\x85\x94#?#g; $line=~s#\xE1\x83\x9D#?#g; $line=~s#\xE2\x99\xA1#?#g; $line=~s#\xE2\x99\xA2#?#g; $line=~s#\xEF\x9E\xAD#?#g; $line=~s#\xE2\x85\x93#?#g; $line=~s#\xE2\x9C\x97#?#g; $line=~s#\xE2\x85\x9E#?#g; $line=~s#\xE2\x85\x9D#?#g; $line=~s#\xCF\x87#?#g; $line=~s#\xE2\x88\xA3#?#g; # change unknown characters to a small x like byu did in their text found first in Harris.sgm $line=~s#\xC3\x97#\nD7#g; if ($line =~m//) { $teiskip = 0; #print OUT_FILE "\n" print OUT_FILE "\n" } #if ($teiskip == 1) { # next; # } if ($teiskip==0) { if (length($line) > $maxLineLength) { $temp_line = $line; while (length($temp_line) > $maxLineLength) { $temp_line =~ m/(.{1,$maxLineLength})\s(.*)/; print OUTFILE "$1\n"; $temp_line = $2; } print OUTFILE "$temp_line\n"; } else { print OUT_FILE "$line\n"; } }
# ---------------------------------------------------------------------------
# NOTE(review): state of this script as stored -- read before editing.
#
# Purpose, as far as the visible statements show: take an item id from
# $ARGV[0], read /caartemp/byu/xml/<itemid>.xml line by line, normalise each
# line (remove \r and \x0C, turn "/>" into ">", replace a fixed list of UTF-8
# byte sequences with ASCII stand-ins) and, once $teiskip has been cleared,
# print the line to /caartemp/byu/bstusgm/<itemid>.sgm, splitting at
# whitespace any line longer than $maxLineLength.
#
# The text looks as if its line breaks were collapsed and everything between
# angle brackets was removed -- TODO confirm against a pristine copy before
# changing any logic.  Items to restore or verify:
#   * Every non-blank physical line above begins with '#', so Perl reads all
#     of it as comment text; the statements need their own lines again
#     before anything here can execute.
#   * "while ($line=)" has no read expression.  Presumably <INFILE>, the only
#     input handle that is opened -- confirm.
#   * The s###g substitutions (one before and five after the substitution
#     that spans several physical lines) and the "m//" test that clears
#     $teiskip have empty patterns.  Perl gives an empty pattern a special
#     meaning (it reuses the last successfully matched pattern), so these
#     cannot simply be left as they are.
#   * "s#.*##g" as shown would erase the whole content of every line;
#     presumably the pattern was delimited by tags that are no longer
#     visible -- confirm.
#   * 'print OUT_FILE "\n"' inside the $teiskip test presumably printed
#     markup ahead of the newline -- confirm.
#   * The comment for \xE2\x80\x99 says the replacement is ', but the
#     replacement shown is a typographic quote; the comment for \xC3\x97 says
#     "a small x", but the replacement shown is "\nD7" (newline + "D7").
#     Check both against the pristine copy.
#   * The closing brace of the "LINE: while" block is not present in the
#     visible text.
#
# Defects that do not depend on the lost text:
#   * The wrapping branch prints to OUTFILE, but the handle opened at the top
#     is OUT_FILE; nothing in the visible text opens OUTFILE.
#   * $maxLineLength is never assigned in the visible text.
#   * The wrapping loop uses $1 and $2 without checking that the match
#     succeeded; after a failed match they still hold the values from an
#     earlier match, so a long line without whitespace may not terminate
#     -- verify.
#   * No chomp is visible, yet "$line\n" is printed; unless one of the lost
#     substitutions removed the newline, output lines are double-spaced
#     -- verify.
#   * OUT_FILE is opened for writing before INFILE is opened, so a missing
#     input file still leaves an empty output file behind.
#   * $infile is left unset when no argument is given; the script then goes
#     on to build both paths from an undefined value.
# ---------------------------------------------------------------------------