00001
00013 #include <iostream>
00014 #include <iomanip>
00015 #include <list>
00016 #include <vector>
00017 #include <set>
00018 #include <fstream>
00019 #include <istream>
00020 #include <algorithm>
00021 #include <sys/time.h>
00022 #include <sys/stat.h>
00023
00024
00025 using namespace std;
00026
00027 #include "Pos.hpp"
00028 #include "TrieSDisco.hpp"
00029
00030
00034 set<string> analizaConsulta(char *consulta)
00035 {
00036 string pnor;
00037 string petiqueta;
00038 set<string> ccad;
00039 if (consulta[0] == '\0') {
00040 return ccad;
00041 }
00042 int estado = 0;
00048 char pal[MAXCAD];
00049 int npal = 0;
00050 unsigned int p = 0;
00051 for (p = 0; p < strlen(consulta); p++) {
00052 if (estado == 0) {
00053 if (isalnum(consulta[p])) {
00054 estado = 1;
00055 npal = 0;
00056 pal[0] = consulta[p];
00057 } else if (consulta[p] == '"') {
00058 estado = 2;
00059 npal = -1;
00060
00061 }
00062 } else if (estado == 1) {
00063 if (isspace(consulta[p])) {
00064 estado = 0;
00065 npal++;
00066 pal[npal] = '\0';
00067 pnor = normaliza(string(pal));
00068 if (pnor != "") {
00069 ccad.insert(pnor);
00070 }
00071 } else if (consulta[p] == ':') {
00072 estado = 3;
00073 npal++;
00074 pal[npal] = '\0';
00075 petiqueta = normaliza(string(pal));
00076 npal = 0;
00077 } else if (npal <(int)MAXCAD-2) {
00078 npal++;
00079 pal[npal] = consulta[p];
00080 }
00081 } else if (estado == 2) {
00082
00083 if (consulta[p] == '"') {
00084 estado = 0;
00085 npal++;
00086 pal[npal] = '\0';
00087
00088 vector<string> partes = estalla(" ", pal);
00089 string sep = "", cres = "";
00090 for(unsigned int np = 0; np < partes.size(); np++) {
00091 pnor = normaliza(string(partes[np]));
00092
00093 if (pnor != "") {
00094 cres += sep + pnor;
00095 sep = " ";
00096 }
00097 }
00098
00099 ccad.insert(cres);
00100 } else if (npal < (int)MAXCAD-3 && (!isspace(consulta[p]) ||
00101 !isspace(consulta[p-1]))) {
00102 npal++;
00103 pal[npal] = consulta[p];
00104
00105 } else {
00106
00107 }
00108 } else if (estado == 3) {
00109 if (isspace(consulta[p])) {
00110 estado = 0;
00111 pal[npal] = '\0';
00112 npal++;
00113 pnor = normaliza(string(pal));
00114
00115
00116 if (pnor != "") {
00117 ccad.insert(petiqueta+":"+pnor);
00118 }
00119 } else if (npal < (int)(MAXCAD-2-petiqueta.length())) {
00120 pal[npal] = consulta[p];
00121 npal++;
00122 }
00123 }
00124 }
00125 if (estado == 1) {
00126 npal++;
00127 ASSERT(npal<=(int)MAXCAD-1);
00128 pal[npal] = '\0';
00129 pnor = normaliza(string(pal));
00130 if (pnor != "") {
00131 ccad.insert(pnor);
00132 }
00133 } else if (estado == 2) {
00134 npal++;
00135 ASSERT(npal<=(int)MAXCAD-2);
00136 pal[npal] = '"';
00137 npal++;
00138 pal[npal] = '\0';
00139 pnor = normaliza(string(pal));
00140 if (pnor != "") {
00141 ccad.insert(pnor);
00142 }
00143 } else if (estado == 3) {
00144 pal[npal] = '\0';
00145 npal++;
00146 pnor = normaliza(string(pal));
00147 if (pnor != "") {
00148
00149 ccad.insert(petiqueta+":"+pnor);
00150 }
00151 }
00152
00153 return ccad;
00154 }
00155
00156
00157 int compDoc(Pos p1, Pos p2)
00158 {
00159 return (p1.numd < p2.numd);
00160 }
00161
00162
00163
00168 set<unsigned long> *realizaBusqueda(char *indice, set<string> &consulta,
00169 vector<Doc> &docs)
00170 {
00171 set<unsigned long> *res = new set<unsigned long>();
00172 set<Pos> *cp = NULL, *cp2 = NULL;
00173 char relacion[MAXLURL];
00174
00175 verificaNombre(indice, relacion);
00176
00177 int mira = -1;
00178 set<Pos>::iterator cpi, cpi2;
00179 set<unsigned long>::iterator resi;
00180 set<string>::iterator i;
00181
00182 leeRelacion(relacion, docs);
00183 for (i = consulta.begin(); i != consulta.end(); i++) {
00184 vector<string> partes = estalla(" ", *i);
00185 if (mira != -1) {
00186
00187 }
00188 for(unsigned int np = 0; np < partes.size(); np++) {
00189 try {
00190
00191 fstream is(indice, ios_base::in);
00192 verificaIndice(is);
00193 try {
00194 cp2 = buscaPlanoStream(is, partes[np]);
00195 } catch (string m) {
00196 throw errorFormato(is, m);
00197 }
00198 is.close();
00199
00200 } catch (string m) {
00201 cerr << indice << ":" << m << endl;
00202 exit(1);
00203 }
00204 if (np == 0) {
00205 cp = cp2;
00206 } else {
00207 set<Pos> *cp3 = new set<Pos>();
00208
00209
00210 cpi = cp->begin();
00211 cpi2 = cp2->begin();
00212 while (cpi != cp->end() && cpi2 != cp2->end()) {
00213
00214 if ((cpi->numd < cpi2->numd)) {
00215 cpi++;
00216 } else if ((cpi->numd == cpi2->numd) &&
00217 ((unsigned int)(cpi->numb + partes[np-1].size() + 5) < (unsigned int)(cpi2->numb))) {
00218 cpi++;
00219 } else if (cpi2->numd < cpi->numd) {
00220 cpi2++;
00221 } else if ((cpi2->numd == cpi->numd) &&
00222 ((unsigned int)cpi2->numb < (unsigned int)(cpi->numb + partes[np-1].size() + 1))) {
00223 cpi2++;
00224 } else {
00225 ASSERT(cpi2->numd == cpi->numd);
00226 ASSERT((unsigned int)cpi->numb + partes[np-1].size() + 1 <= (unsigned int)cpi2->numb);
00227 ASSERT((unsigned int)cpi2->numb <= (unsigned int)cpi->numb + partes[np-1].size() + 5);
00228
00229 cp3->insert(*cpi2);
00230 cpi++;
00231 cpi2++;
00232 }
00233 }
00234 if (cp == cp2) {
00235 delete cp;
00236 } else {
00237 delete cp;
00238 delete cp2;
00239 }
00240 cp = cp3;
00241 }
00242 }
00243
00244 if (cp != NULL) {
00245 if (i == consulta.begin()) {
00246 ASSERT(res->size() == 0);
00247 for (cpi = cp->begin(); cpi != cp->end(); cpi++) {
00248
00249 res->insert(cpi->numd);
00250 docs[cpi->numd - 1].numoc++;
00251 if (mira == -1) {
00252 mira = cpi->numd - 1;
00253
00254 } else if (mira == cpi->numd - 1) {
00255
00256 }
00257 }
00258
00259 } else {
00260 set<unsigned long> *res2 =
00261 new set<unsigned long>();
00262 if (mira != -1) {
00263
00264 }
00265
00266 cpi = cp->begin();
00267 resi = res->begin();
00268 while (cpi != cp->end() && resi != res->end()) {
00269 if ((unsigned long)cpi->numd < *resi) {
00270 cpi++;
00271 } else if (*resi < (unsigned long)cpi->numd) {
00272 resi++;
00273 } else {
00274 res2->insert(cpi->numd);
00275
00276 while (*resi == (unsigned long)cpi->numd) {
00277 docs[cpi->numd - 1].numoc++;
00278 cpi++;
00279 }
00280 if (*resi == (unsigned long)mira) {
00281
00282 }
00283 resi++;
00284 }
00285 }
00286
00287
00288
00289 delete res;
00290 res = res2;
00291 }
00292 delete cp;
00293 } else {
00294 delete res;
00295 res = new set<unsigned long>();
00296 break;
00297 }
00298
00299 }
00300
00301
00302 return res;
00303 }
00304
00305
00306 string nombra_resconsulta(set<string> &cons, char *ind)
00307 {
00308 string r("../indices/consultas/");
00309 for(char *p = ind; *p != '\0'; p++) {
00310 if (isalnum(*p)) {
00311 r += *p;
00312 } else {
00313 r += '_';
00314 }
00315 }
00316 set<string>::iterator i;
00317 for(i = cons.begin(); i != cons.end(); i++) {
00318 r += "_" + (*i);
00319 }
00320 return r;
00321 }
00322
00323 bool resconsulta_reciente(string nc)
00324 {
00325 FILE *f = fopen(nc.c_str(), "r");
00326 if (f == NULL) {
00327 return false;
00328 }
00329 fclose(f);
00330
00331 struct tm* rel;
00332 struct stat atr;
00333 stat(nc.c_str(), &atr);
00334 rel = gmtime(&(atr.st_mtime));
00335 time_t o = mktime(rel);
00336 time_t actual = time_t();
00337 double d = difftime(actual, o);
00338
00339 if (d>(24*60*60)) {
00340 return false;
00341 }
00342
00343 return true;
00344 }
00345
00346
00347 void guarda_resconsulta(string nc, vector<unsigned long> *vpos)
00348 {
00349 ASSERT(nc != "");
00350 ASSERT(nc.length() < MAXLURL);
00351 ASSERT(vpos != NULL);
00352
00353 fstream os(nc.c_str(), ios_base::out);
00354 escribeNDesp(os, vpos->size());
00355 for(unsigned int i = 0; i < vpos->size(); i++) {
00356 escribeNDesp(os, (*vpos)[i]);
00357 }
00358 os.close();
00359 }
00360
00361 vector<unsigned long> *lee_resconsulta(string nc)
00362 {
00363 ASSERT(nc != "");
00364 ASSERT(nc.length() < MAXLURL);
00365
00366 fstream is(nc.c_str(), ios_base::in);
00367 if (!is) {
00368 throw std::string("No pudo abrirse" + nc);
00369 }
00370 vector<unsigned long> *vpos = new vector<unsigned long>();
00371 long t = leeNDesp(is);
00372 for (long i = 0; i < t; i++) {
00373 long n = leeNDesp(is);
00374 vpos->push_back(n);
00375 }
00376 is.close();
00377
00378 return vpos;
00379 }
00380
00381 class Esc
00382 {
00383 public:
00384 vector<Doc> *docs;
00385 Esc(vector<Doc> *d) : docs(d)
00386 {
00387 ASSERT(d != NULL);
00388 }
00389
00390 bool operator() (unsigned long i, unsigned long j)
00391 {
00392 if (i >= docs->size() || j >= docs->size()) {
00393 throw "Problema buscando. Posiblemente no coinciden índice y relación";
00394 }
00395 ASSERT(i>=0 && i<docs->size());
00396 ASSERT(j>=0 && j<docs->size());
00397 if ((*docs)[i].fecha > (*docs)[j].fecha) {
00398 return true;
00399 } else if ((*docs)[i].fecha == (*docs)[j].fecha) {
00400 return (*docs)[i].numoc > (*docs)[j].numoc;
00401 }
00402 return false;
00403 }
00404 };
00405
00406 vector<unsigned long> *escalafon(set<unsigned long> *cpos, vector<Doc> *pdocs)
00407 {
00408
00409 Esc e(pdocs);
00410
00411 set<unsigned long>::iterator cpi;
00412 vector<unsigned long> *vpos=new vector<unsigned long>();
00413 for (cpi = cpos->begin(); cpi != cpos->end(); cpi++) {
00414 vpos->push_back(*cpi-1);
00415
00416 }
00417
00418
00419 sort (vpos->begin(), vpos->end(), e);
00420
00421 return vpos;
00422 }
00423
00424
00425
00426 timespec tdiff(timespec start, timespec end)
00427 {
00428 timespec temp;
00429 if ((end.tv_nsec-start.tv_nsec)<0) {
00430 temp.tv_sec = end.tv_sec-start.tv_sec-1;
00431 temp.tv_nsec = 1000000000+end.tv_nsec-start.tv_nsec;
00432 } else {
00433 temp.tv_sec = end.tv_sec-start.tv_sec;
00434 temp.tv_nsec = end.tv_nsec-start.tv_nsec;
00435 }
00436 return temp;
00437 }
00438
00443 string& cad_remplaza(const string &busca, const string &remplaza, string &cadena)
00444 {
00445 string buffer;
00446
00447 int sealeng = busca.length();
00448 int strleng = cadena.length();
00449
00450 if (sealeng==0) {
00451 return cadena;
00452 }
00453
00454 for(int i=0, j=0; i<strleng; j=0 ) {
00455 while (i+j<strleng && j<sealeng && cadena[i+j]==busca[j]) {
00456 j++;
00457 }
00458 if (j==sealeng)
00459 {
00460 buffer.append(remplaza);
00461 i+=sealeng;
00462 } else {
00463 buffer.append( &cadena[i++], 1);
00464 }
00465 }
00466 cadena = buffer;
00467 return cadena;
00468 }
00469
00470
00471 string escapa(char *s)
00472 {
00473 string t=s;
00474 cad_remplaza("\"", "\\\"", t);
00475 return t;
00476 }
00477
00478
00479 int main(int argc, char *argv[])
00480 {
00481
00482 unsigned long inicio = 1;
00483 unsigned long fin = 10;
00484 timespec t1;
00485 clock_gettime(CLOCK_REALTIME, &t1);
00486 if (argc < 3) {
00487 cerr<<"Se esperaban al menos 2 argumentos, el primero indice por leer y el segundo consulta. 3ero número de resultado inicial"<<endl;
00488 exit(1);
00489 }
00490
00491 vector<Doc> docs;
00492 set<string> cons = analizaConsulta(argv[2]);
00493
00494
00495
00496
00497 char *f;
00498 if ((f = strstr(argv[1], ".indice")) == NULL ||
00499 (unsigned int)(f - argv[1]) != (strlen(argv[1]) - 7) ) {
00500 cerr << "se esperaba extensión .indice. "
00501 << argv[1] << endl;
00502 exit(1);
00503 }
00504
00505 string nc = nombra_resconsulta(cons, argv[1]);
00506
00507 vector<unsigned long> *vpos = NULL;
00508
00509 bool sincopiarec = false;
00510 if (argc >= 6 && strcmp(argv[5], "sincopiarecientes")) {
00511 sincopiarec = true;
00512 }
00513
00514
00515
00516 set<unsigned long> *cpos = realizaBusqueda(argv[1], cons, docs);
00517 vpos = escalafon(cpos, &docs);
00518 ASSERT(cpos->size() == vpos->size());
00519
00520
00521
00522
00523
00524
00525
00526
00527
00528
00529
00530
00531
00532
00533
00534
00535
00536
00537 if (argc >= 4) {
00538 inicio = atoi(argv[3]);
00539 if (inicio >= vpos->size() ) {
00540 inicio = vpos->size();
00541 }
00542 if (inicio <= 0) {
00543 inicio = 1;
00544 }
00545 }
00546 if (argc >= 5) {
00547 fin = atoi(argv[4]);
00548 if (fin == 0) {
00549 fin = vpos->size();
00550 }
00551 if (fin < inicio) {
00552 fin = inicio;
00553 }
00554 if (fin > vpos->size() ) {
00555 fin = vpos->size();
00556 }
00557 } else {
00558 fin = vpos->size() < 10 ? vpos->size() : 10;
00559 }
00560
00561
00562
00563
00564 cout << "{" << endl;
00565 cout << " \"consulta\": \"" << escapa(argv[2]) << "\"," << endl;
00566 cout << " \"documentos\": " << vpos->size() << "," << endl;
00567 cout << " \"inicio\": " << inicio << "," << endl;
00568 cout << " \"fin\": " << fin << "," << endl;
00569 cout << " \"resultados\": [" << endl;
00570
00571 unsigned long ns = 1;
00572 for (ns = inicio; ns <= fin; ns++) {
00573 ASSERT((*vpos)[ns - 1] >= 0 &&
00574 (unsigned int)(*vpos)[ns - 1] < docs.size());
00575 cout << " { \"url\": \"" <<
00576 docs[(*vpos)[ns - 1]].URL << "\",\n";
00577 cout << " \"frec\": \"" <<
00578 docs[(*vpos)[ns - 1]].numoc << "\",\n";
00579 cout << " \"fecha\": \"" <<
00580 docs[(*vpos)[ns - 1]].fecha << "\"\n";
00581 cout << " }";
00582 if (ns < fin) {
00583 cout << ", ";
00584 }
00585 cout << endl;
00586 }
00587
00588 timespec t2;
00589 clock_gettime(CLOCK_REALTIME, &t2);
00590
00591
00592
00593
00594 cout << " ]" << endl;
00595
00596 cout << "}" << endl;
00597
00598 return 0;
00599 }
00600