From: Jim M. <jim...@je...> - 2016-07-26 18:58:30
|
Is there a partial bug in #if defined(USEFSTREAM)? For a while it looked as if the directive was being ignored and the stdio code path was being used instead, but I could not reproduce that once I had corrected my code, so I cannot reproduce the bug.

Strangely enough, I output \r\n, yet the resulting file does not have the \r characters in it and is not editable via Notepad. Remember real terminals before xterms, like the ADM-3A? They required both CR and LF to go to the beginning of the next line. Linux is violating that de facto standard: \r is being filtered out, and it should not be. In fact, colleges should go back to teaching about CR+LF (\r\n). macOS is wrong on this too.

/*
Author: Jim Michaels, for Jim Michaels only, for now
Abstract: takes the joined files from KBIB_KJ1..5.zip from the Simtel MSDOS collection and joins the split-up verses in there so it is 1 verse per line.
Created: 3-25-2016
Copyright:
License:
Instructions: run in the same extracted dir where GEN.TXT resides.
Bugs:
- 1.0 access violation (pointer problem, array-out-of-bounds problem) and program does not execute.
*/
#define PROGRAM_NAME "KBIB_KJ1to5toBible_txt"
#define PROGRAM_DESCRIPTION "takes files in KBIB_KJ1..5.zip and converts it to 1 verse per line."
#define PROGRAM_VERSION "1.0"
#define BUFSIZE (16777216)
#define USEFSTREAM

#include <cctype>
#include <cstddef>
#include <string>
#include <vector>
#include <fstream>
#include <iostream>
#include <iterator>
#include <initializer_list>
#include <ios>
#include <stdio.h>
#include <regex>

typedef std::vector<std::string> VS;
typedef VS::size_type VST;
typedef VS::iterator VSI;

// Input files, concatenated in this order.
VS bookFilenames={
    "GEN.TXT", "EXO.TXT", "LEV.TXT", "NUM.TXT", "DEU.TXT", "JOS.TXT",
    "JDG.TXT", "RTH.TXT", "SA1.TXT", "SA2.TXT", "KI1.TXT", "KI2.TXT",
    "CH1.TXT", "CH2.TXT", "EZR.TXT", "NEH.TXT", "EST.TXT", "JOB.TXT",
    "PSA.TXT", "PRO.TXT", "ECC.TXT", "SON.TXT", "ISA.TXT", "JER.TXT",
    "LAM.TXT", "EZE.TXT", "DAN.TXT", "HOS.TXT", "JOE.TXT", "AMO.TXT",
    "OBA.TXT", "JON.TXT", "MIC.TXT", "NAH.TXT", "HAB.TXT", "ZEP.TXT",
    "HAG.TXT", "ZEC.TXT", "MAL.TXT", "MAT.TXT", "MAR.TXT", "LUK.TXT",
    "JOH.TXT", "ACT.TXT", "ROM.TXT", "CO1.TXT", "CO2.TXT", "GAL.TXT",
    "EPH.TXT", "PHI.TXT", "COL.TXT", "TH1.TXT", "TH2.TXT", "TI1.TXT",
    "TI2.TXT", "TIT.TXT", "PHM.TXT", "HEB.TXT", "JAM.TXT", "PE1.TXT",
    "PE2.TXT", "JO1.TXT", "JO2.TXT", "JO3.TXT", "JUD.TXT", "REV.TXT"
};

namespace str {

namespace detail {

// Maps a character to the integer used for ordering. The value is taken as
// unsigned char so that bytes >= 0x80 are safe to pass to std::toupper.
inline int fold(char c, bool iCase)
{
    const unsigned char u = static_cast<unsigned char>(c);
    return iCase ? std::toupper(u) : static_cast<int>(u);
}

// Returns the index of the first occurrence of needle in haystack that starts
// at or after `from`, or std::string::npos when there is none. An empty needle
// never matches.
inline size_t findNext(const std::string& haystack, const std::string& needle,
                       size_t from, bool iCase)
{
    if (needle.empty() || needle.size() > haystack.size()) {
        return std::string::npos;
    }
    if (!iCase) {
        return haystack.find(needle, from);
    }
    const size_t lastStart = haystack.size() - needle.size();
    for (size_t at = from; at <= lastStart; ++at) {
        size_t k = 0;
        while (k < needle.size() &&
               fold(haystack[at + k], true) == fold(needle[k], true)) {
            ++k;
        }
        if (k == needle.size()) {
            return at;
        }
    }
    return std::string::npos;
}

}//namespace detail

// Three-way comparison of the tail of `first` (from index firstPos to its end)
// against the whole of `second`.
//   iCase    - true to ignore letter case.
//   firstPos - index in `first` where the compared tail begins.
// Returns:
//   -2  `second` is longer than the tail (this includes firstPos past the end)
//   +1  the tail is longer than `second`, or equal length and greater
//   -1  equal length and the tail is less
//    0  equal length and all characters equal
// Length is decided before content, as in the original checks. The original
// indexed one past the end on its first iteration (std::out_of_range) and
// ignored firstPos in its length test; both are corrected here.
int compare(const std::string& first, const std::string& second, bool iCase=false, size_t firstPos=0)
{
    // Written as a subtraction so that firstPos + second.size() cannot wrap.
    if (firstPos > first.size() || second.size() > first.size() - firstPos) {
        return -2;
    }
    const size_t tailLen = first.size() - firstPos;
    if (tailLen > second.size()) {
        return +1;
    }
    //they are both equal length here.
    for (size_t i = 0; i < tailLen; ++i) {
        const int a = detail::fold(first[firstPos + i], iCase);
        const int b = detail::fold(second[i], iCase);
        if (a < b) return -1;
        if (a > b) return +1;
    }
    return 0;
}//compare

// Returns the start index of every occurrence of needle in haystack that
// begins at or after pos, in ascending order. Overlapping occurrences are all
// reported. The result is empty when needle is empty, when needle is longer
// than haystack, or when nothing matches.
//   iCase - true to ignore letter case.
std::vector< size_t > find(const std::string& haystack, const std::string& needle, bool iCase=false, size_t pos=0)
{
    std::vector< size_t > hits;
    for (size_t at = detail::findNext(haystack, needle, pos, iCase);
         at != std::string::npos;
         at = detail::findNext(haystack, needle, at + 1, iCase)) {
        hits.push_back(at);
    }
    return hits;
}//find

// Returns a copy of target in which occurrences of findwhat that start at or
// after pos are replaced by replacewith.
//   iCase - true to ignore letter case when matching.
//   all   - true replaces every non-overlapping occurrence; false replaces
//           only the first one.
// The result is built in a single left-to-right pass over target, so text
// inserted by a replacement is never rescanned: the call terminates even when
// replacewith contains findwhat. An empty findwhat returns target unchanged.
std::string str_replace(const std::string& target, const std::string& findwhat, const std::string& replacewith, size_t pos=0, bool iCase=false, bool all=true)
{
    if (findwhat.empty()) {
        return target;
    }
    std::string result;
    result.reserve(target.size());
    size_t copiedUpTo = 0; // everything before this index is already in result
    size_t at = detail::findNext(target, findwhat, pos, iCase);
    while (at != std::string::npos) {
        result.append(target, copiedUpTo, at - copiedUpTo);
        result += replacewith;
        copiedUpTo = at + findwhat.size();
        if (!all) {
            break;
        }
        at = detail::findNext(target, findwhat, copiedUpTo, iCase);
    }
    result.append(target, copiedUpTo, std::string::npos);
    return result;
}

// Alias for str_replace with the same parameters. The original forwarded
// iCase and pos in swapped order; they are forwarded correctly here.
std::string stringReplace(const std::string& target, const std::string& findwhat, const std::string& replacewith, size_t pos=0, bool iCase=false, bool all=true)
{
    return str_replace(target, findwhat, replacewith, pos, iCase, all);
}

}//namespace str

// Program state. buf is used by the stdio build only; line is kept for
// compatibility and is no longer used.
char buf[BUFSIZE+1];
std::string content,beginningOfLine="\x1a",line,ofname="Stephanus1550Bible.txt";
#if defined(USEFSTREAM)
std::ofstream fout;
std::ifstream fin;
#else
FILE *fin=nullptr;
FILE *fout=nullptr;
#endif

// Define KBIB_NO_MAIN to compile the helpers above without the program entry
// point (for example, to unit-test namespace str).
#ifndef KBIB_NO_MAIN
//make verse-per-line Bible out of KBIB-KJ1..5.zip's text files
//
// Concatenates every file in bookFilenames, removes all CR and LF characters,
// replaces each beginningOfLine marker with CR+LF, and writes the result to
// ofname. Returns 1 if the output or any input could not be processed, else 0.
// All files are opened in binary mode so bytes are read and written verbatim
// (no newline translation, no special handling of 0x1A).
int main(void)
{
    bool err = false;

#if defined(USEFSTREAM)
    std::cout<<"USING FSTREAM"<<std::endl;
    fout.open(ofname.c_str(), std::ios_base::out|std::ios_base::trunc|std::ios_base::binary);
    if (!fout.good()) {
        std::cout<<"kbib-to-Bible:ERROR: cannot open "<<ofname<<" for output"<<std::endl;
        return 1;//return failure via %ERRORLEVEL%
    }
    for (VST i = 0; i < bookFilenames.size(); i++) {
        fin.clear(); // drop any failure state left by a previous file
        fin.open(bookFilenames[i].c_str(), std::ios_base::in|std::ios_base::binary);
        if (!fin.is_open()) {
            std::cout<<"kbib-to-Bible:ERROR: cannot open "<<bookFilenames[i]<<" for input"<<std::endl;
            err = true;
            continue;//keep going; failure is reported via %ERRORLEVEL%
        }
        content.append(std::istreambuf_iterator<char>(fin), std::istreambuf_iterator<char>());
        fin.close();
    }
#else
    fout = fopen(ofname.c_str(), "wb");
    if (nullptr == fout) {
        printf("kbib-to-Bible:ERROR: cannot open %s for output\r\n", ofname.c_str());
        return 1;//return failure via %ERRORLEVEL%
    }
    for (VST i = 0; i < bookFilenames.size(); i++) {
        fin = fopen(bookFilenames[i].c_str(), "rb");
        if (nullptr == fin) {
            printf("ERROR: cannot open %s for input\r\n", bookFilenames[i].c_str());
            err = true;
            continue;//keep going; fout must stay open for the final write
        }
        size_t total = 0;
        size_t got = 0;
        // Element size 1 makes fread return a byte count.
        while ((got = fread(buf, sizeof(char), BUFSIZE, fin)) > 0) {
            content.append(buf, got);
            total += got;
        }
        printf("%lld %s\r\n", static_cast<long long>(total), bookFilenames[i].c_str());
        fclose(fin);
    }
#endif

    content = str::str_replace(content, "\r", "");
    content = str::str_replace(content, "\n", "");
    content = str::str_replace(content, beginningOfLine, "\r\n");

#if defined(USEFSTREAM)
    fout << content << "\r\n";
    fout.flush();
    if (!fout) {
        std::cout<<"kbib-to-Bible:ERROR: cannot write "<<ofname<<std::endl;
        err = true;
    }
    fout.close();
#else
    if (fwrite(content.data(), sizeof(char), content.size(), fout) != content.size()
        || fputs("\r\n", fout) == EOF) {
        printf("kbib-to-Bible:ERROR: cannot write %s\r\n", ofname.c_str());
        err = true;
    }
    printf("%s", content.c_str());
    fclose(fout);
#endif
    return err?1:0;//return success via %ERRORLEVEL%
}
#endif // KBIB_NO_MAIN