Learn how easy it is to sync an existing GitHub or Google Code repo to a SourceForge project! See Demo

Close

[r1406]: branches / bleeding-edge / includes / import.inc.php Maximize Restore History

Download this file

import.inc.php    3939 lines (3293 with data), 237.3 kB

   1
   2
   3
   4
   5
   6
   7
   8
   9
  10
  11
  12
  13
  14
  15
  16
  17
  18
  19
  20
  21
  22
  23
  24
  25
  26
  27
  28
  29
  30
  31
  32
  33
  34
  35
  36
  37
  38
  39
  40
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
<?php
// Project: Web Reference Database (refbase) <http://www.refbase.net>
// Copyright: Matthias Steffens <mailto:refbase@extracts.de> and the file's
// original author(s).
//
// This code is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY. Please see the GNU General Public
// License for more details.
//
// File: ./includes/import.inc.php
// Repository: $HeadURL: file:///nfs/sf-svn/r/re/refbase/code/branches/bleeding-edge/includes/import.inc.php $
// Author(s): Matthias Steffens <mailto:refbase@extracts.de>
//
// Created: 13-Jan-06, 21:00
// Modified: $Date: 2013-04-29 21:45:09 +0000 (Mon, 29 Apr 2013) $
// $Author: msteffens $
// $Revision: 1377 $
// This file contains functions
// that are used when importing
// records into the database.
// TODO: I18n
// Import the ActiveLink Packages
require_once("classes/include.php");
import("org.active-link.xml.XML");
import("org.active-link.xml.XMLDocument");
include 'includes/transtab_bibtex_refbase.inc.php'; // include BibTeX markup -> refbase search & replace patterns
include 'includes/transtab_endnotexml_refbase.inc.php'; // include Endnote XML text style markup -> refbase search & replace patterns
if ($contentTypeCharset == "UTF-8") // variable '$contentTypeCharset' is defined in 'ini.inc.php'
include_once 'includes/transtab_latex_unicode.inc.php'; // include LaTeX -> Unicode translation table
else // we assume "ISO-8859-1" by default
include_once 'includes/transtab_latex_latin1.inc.php'; // include LaTeX -> Latin1 translation table
include_once('includes/classes/org/simplepie/SimplePie.compiled.php'); // include the SimplePie library
// --------------------------------------------------------------------
// ISI TO CSA
// This function converts records from "ISI Web of Science" format to "CSA" format
// in order to enable import of ISI WoS records via the 'csaToRefbase()' function.
// ISI WoS records must contain at least the tags "PT" and "SO" and must be terminated
// by an "ER" (= end of record) line. The "ER" line of the last record may also be the
// very last line of the input, i.e. it need not be followed by a line break.
//
// Parameters: - $isiSourceData: string containing one or more ISI WoS records
//
// Returns:    a string containing all converted records in CSA format, separated by an
//             empty line (or an empty string if the input contained no usable record)
//
// Notes:      - records that lack the "PT" or the "SO" tag are skipped silently
//             - only the ISI tags listed in '$isiToCsaTagsArray' are converted, all other
//               ISI fields are dropped (except "EM", "BP", "EP" & "PG" which are merged
//               into the affiliation and the generated "JP: Journal Pages" field)
//
// Authors: this function was originally written by Joachim Almergren <joachim.almergren@umb.no>
// and was re-written by Matthias Steffens <mailto:refbase@extracts.de> to enable batch import
function isiToCsa($isiSourceData)
{
	global $punct, $upper, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'

	// Function preferences:
	$extractAllAddresses = false; // if set to 'true', all addresses will be extracted from the ISI "C1" field;
	                              // set to 'false' if you only want to extract the first address given in the ISI "C1" field
	$extractEmail = true; // if set to 'true', the first email address will be extracted from the ISI "EM" field and appended to the first address in "AF: Affiliation";
	                      // set to 'false' if you don't want to extract the email address

	// Data lines of a field are indented by four spaces in the generated output (ISI continuation
	// lines use only three). The indent is built with an explicit count so that its width stays
	// unambiguous:
	$csaDataIndent = str_repeat(" ", 4);

	// Generate an array which lists all the CSA tags that are recognized by the 'csaToRefbase()' function
	// and match them with their corresponding ISI tags ("CSA tag" => "ISI tag"):
	$isiToCsaTagsArray = array(
		"PT: Publication Type" => "PT",
		"AU: Author" => "AU",
		"TI: Title" => "TI",
		"SO: Source" => "SO",
		"PY: Publication Year" => "PY",
		// "JN: Journal Name" => "", // the 'csaToRefbase()' function will generate the full journal name from "SO: Source"
		"JA: Abbrev Journal Name" => "JI",
		// "MT: Monograph Title" => "", // the ISI WoS database contains only journal articles (AFAIK)
		"JV: Journal Volume" => "VL",
		"JI: Journal Issue" => "IS",
		// "JP: Journal Pages" => "", // ISI WoS contains separate tags for start page ("BP") and end page ("EP"), we'll compute a "JP: Journal Pages" from these fields below
		"AF: Affiliation" => "C1",
		// "CA: Corporate Author" => "",
		"DE: Descriptors" => "DE",
		"AB: Abstract" => "AB",
		"PB: Publisher" => "PU",
		// "" => "PI", // AFAIK, CSA offers no field for the place of publisher (though it would be nice to import this info as well)
		// "ED: Editor" => "",
		"LA: Language" => "LA",
		// "SL: Summary Language" => "",
		// "OT: Original Title" => "",
		"IS: ISSN" => "SN",
		// "IB: ISBN" => "",
		// "ER: Environmental Regime" => "",
		// "CF: Conference" => "",
		"NT: Notes" => "UT", // we'll import the ISI record ID to the notes field
		// "DO: DOI" => "DI", // Bibutils apparently recognizes "DI" to extract the DOI from ISI records, however, my tests returned only ISI records where the "DI" field contained some other identifier ?:-/
	);

	// ----------------------------------------------------------------

	// SPLIT INPUT text on the "ER" (= end of record) tag that terminates every ISI record:
	// (the "ER" line must either be followed by a line break or be the very last thing in the input;
	//  the 'PREG_SPLIT_NO_EMPTY' flag causes only non-empty pieces to be returned)
	$isiRecordsArray = preg_split("/\s*[\r\n]ER *(?:[\r\n]\s*|\z)/", $isiSourceData, -1, PREG_SPLIT_NO_EMPTY);
	$recordsCount = count($isiRecordsArray); // count how many records are available

	$csaRecordsArray = array(); // initialize array variable which will hold all records that were converted to CSA format

	// ----------------------------------------------------------------

	// LOOP OVER EACH RECORD:
	for ($i = 0; $i < $recordsCount; $i++) // for each record...
	{
		$isiRecord = $isiRecordsArray[$i];

		// we'll only process a record if its text contains the "PT" tag as well as the "SO" tag:
		if (!preg_match("/^PT /m", $isiRecord) OR !preg_match("/^SO /m", $isiRecord))
			continue; // skip this record

		$csaRecordFieldsArray = array(); // initialize array variable which will hold all fields that we've converted to CSA format

		// extract first email address from ISI "EM" field:
		// (the value is captured directly by the '^'-anchored match, so this also works if "EM" is the first line of the record)
		$emailAddress = "";
		if (preg_match("/^EM ([^ \r\n]+)/m", $isiRecord, $matches))
			$emailAddress = $matches[1];

		// extract start page (ISI "BP" field) and end page (ISI "EP" field):
		$pages = array();
		foreach (array("BP", "EP") as $pageTag)
		{
			if (preg_match("/^" . $pageTag . " ([^ \r\n][^\r\n]*)/m", $isiRecord, $matches))
				$pages[] = $matches[1];
		}

		if (!empty($pages))
			$pageRange = implode("-", $pages);
		// if no start or end page is given, we'll try the ISI "PG" field that indicates the total number of pages:
		elseif (preg_match("/^PG ([^ \r\n][^\r\n]*)/m", $isiRecord, $matches))
			$pageRange = $matches[1] . " pp";
		else
			$pageRange = "";

		// split each record into its individual fields:
		$isiRecordFieldsArray = preg_split("/[\r\n]+(?=\w\w )/", $isiRecord);

		// LOOP OVER EACH FIELD:
		foreach ($isiRecordFieldsArray as $recordField)
		{
			// we'll only process a field if it starts with two letters followed by a space;
			// in that case, the field gets split into its tag and its field data:
			if (!preg_match("/^(\w\w) (.*)/s", $recordField, $fieldParts))
				continue; // process next ISI field in '$isiRecordFieldsArray'

			$isiTag = $fieldParts[1];
			$recordFieldData = $fieldParts[2];

			// look up the CSA tag that corresponds to the found ISI field identifier tag:
			$csaTag = array_search($isiTag, $isiToCsaTagsArray, true);

			if ($csaTag === false)
				continue; // this ISI field isn't converted, process next ISI field in '$isiRecordFieldsArray'

			// add a space to the beginning of any data line that starts with only three spaces (instead of four):
			$recordFieldData = preg_replace("/^ {3}(?! )/m", $csaDataIndent, $recordFieldData);

			// convert ISI publication type "J" into CSA format ("Journal Article"):
			if (($isiTag == "PT") AND ($recordFieldData == "J"))
				$recordFieldData = "Journal Article";

			// merge multiple authors (that are printed on separate lines) with a semicolon (';') and a space:
			elseif ($isiTag == "AU")
				$recordFieldData = preg_replace("/\s*[\r\n]\s*/", "; ", $recordFieldData);

			// process address info:
			elseif ($isiTag == "C1")
			{
				// remove any trailing punctuation from end of string:
				$recordFieldData = preg_replace("/[$punct]+$/$patternModifiers", "", $recordFieldData);

				// split address data string into individual addresses (which are given as one address per line);
				// a single address without any line break is returned as the only array element:
				$recordFieldDataArray = preg_split("/[$punct\s]*[\r\n]\s*/$patternModifiers", $recordFieldData);

				// append the first email address from ISI "EM" field to the first address in "AF: Affiliation":
				if (($extractEmail) AND (!empty($emailAddress)))
					$recordFieldDataArray[0] .= ", Email: " . $emailAddress;

				if ($extractAllAddresses)
					// merge multiple addresses with a semicolon (';') and a space:
					$recordFieldData = implode("; ", $recordFieldDataArray);
				else
					// use only the first address in "AF: Affiliation":
					$recordFieldData = $recordFieldDataArray[0];
			}

			// if a comma (',') is used as keyword delimiter, we'll convert it into a semicolon (';'):
			elseif (($isiTag == "DE") AND (strpos($recordFieldData, ";") === false))
				$recordFieldData = preg_replace("/ *, */", "; ", $recordFieldData);

			// if all of the record data is in uppercase letters, we attempt to convert the string to something more readable:
			if ((preg_match("/^[$upper\W\d]+$/$patternModifiers", $recordFieldData)) AND ($isiTag != "UT")) // we exclude the ISI record ID from the ISI "UT" field
				// convert upper case to title case (converts e.g. "ELSEVIER SCIENCE BV" into "Elsevier Science Bv"):
				// (note that this case transformation won't do the right thing for author initials and abbreviations,
				//  but the result is better than the whole string being upper case, IMHO)
				$recordFieldData = changeCase('title', $recordFieldData); // function 'changeCase()' is defined in 'include.inc.php'

			// merge CSA field tag and data again & append this field to array of CSA fields:
			$csaRecordFieldsArray[] = $csaTag . "\n" . $csaDataIndent . $recordFieldData;
		}

		// append "JP: Journal Pages" field with generated page range to array of CSA fields:
		if (!empty($pageRange))
			$csaRecordFieldsArray[] = "JP: Journal Pages\n" . $csaDataIndent . $pageRange;

		// merge CSA fields into a string and prefix it with a CSA record identifier:
		// (the record number reflects the position within the input, so skipped records leave gaps)
		$csaRecord = "Record " . ($i + 1) . " of " . $recordsCount . "\n\n" . implode("\n", $csaRecordFieldsArray);

		// append this record to array of CSA records:
		$csaRecordsArray[] = $csaRecord;
	}

	// return all CSA records merged into a string:
	return implode("\n\n", $csaRecordsArray);
}
// --------------------------------------------------------------------
// CROSSREF TO REFBASE
// This function converts records from CrossRef's "unixref" XML format into the standard "refbase"
// array format which can then be imported by the 'addRecords()' function in 'include.inc.php'.
//
// NOTES: - So far, CrossRef seems to be the only provider of this data format & they do not yet
// use it to return more than one result. However, since function 'fetchDataFromCrossRef()'
// can fetch data for multiple DOIs and appends found records to '$sourceText', this
// function does allow for batch import (though we might need to tweak '$recordDelimiter'
// if multiple results would be returned directly by CrossRef).
//
// - This is our first and only native XML import format, so we do not use functions
// 'validateRecords()' or 'parseRecords()'.
//
// TODO:
// - one of these, in order of preference:
// - change functions 'validateRecords()' & 'parseRecords()' to accept non-tagged, XML references
// - add new XML validation/parsing functions
// - transform XML to a tagged format
//
// Authors: Richard Karnesky <mailto:karnesky@gmail.com> and
// Matthias Steffens <mailto:refbase@extracts.de>
function crossrefToRefbase($sourceText, $importRecordsRadio, $importRecordNumbersArray)
{
global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'
global $contentTypeCharset; // defined in 'ini.inc.php'
global $errors;
global $showSource;
// Pattern by which the input text will be split into individual records:
$recordDelimiter = "(\s*(<\?xml[^<>]*\?>\s*)?<doi_records[^<>]*>)?\s*(?=<doi_record[^<>]*>)" // splits before '<doi_record>'
. "|"
. "(?<=<\/doi_record>)\s*(<\/doi_records>\s*)?"; // splits after '</doi_record>'
// Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data:
// (Notes: - name standardization occurs after multiple author fields have been merged by '; '
// - the split pattern must be specified as perl-style regular expression (including the leading & trailing
// slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
$personDelimiter = "/ *; */";
// Pattern by which a person's family name is separated from the given name (or initials):
// (the split pattern must be specified as perl-style regular expression (including the leading & trailing
// slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
$familyNameGivenNameDelimiter = "/ *, */";
// Specifies whether the person's family name comes first within a person's name
// ('true' means that the family name is followed by the given name (or initials), 'false' means that the person's family name comes *after* the given name (or initials))
$familyNameFirst = true;
// Specifies whether a person's full given name(s) shall be shortened to initial(s):
// (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized (meaning removal of extra whitespace, adding of dots between initials, etc)
// - if set to 'false', given names (and any initials) are taken as is
// - in your database, you should stick to either fully written given names OR initials; if you mix these, records won't get sorted correctly on citation output)
$shortenGivenNames = true;
// Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'):
$transformCase = true;
// Postprocessor actions:
// Defines search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element:
// (If you don't want to perform any search and replace actions, specify an empty array, like: '$postprocessorActionsArray = array();'.
// Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
// "/Search Pattern/" => "Replace Pattern"
$postprocessorActionsArray = array(
array(
'fields' => array("title"),
'actions' => array(
"/[,.;:!] *$/" => "" // remove any punctuation (except for question marks) from end of field contents
)
),
array(
'fields' => array("author", "title", "publication", "abbrev_journal"), // convert HTML font attributes (which some publishers include in their CrossRef data, see e.g. doi:10.1515/BOT.2001.002)
'actions' => array(
"/<sup>(.+?)<\/sup>/i" => "[super:\\1]", // replace '<sup>...</sup>' with refbase markup ('[super:...]')
"/<sub>(.+?)<\/sub>/i" => "[sub:\\1]", // replace '<sub>...</sub>' with refbase markup ('[sub:...]')
"/<i>(.+?)<\/i>/i" => "_\\1_", // replace '<i>...</i>' with refbase markup ('_..._')
"/<b>(.+?)<\/b>/i" => "**\\1**" // replace '<b>...</b>' with refbase markup ('**...**')
)
),
array(
'fields' => array("author", "title", "year", "publication", "abbrev_journal", "volume", "issue", "pages", "issn", "url", "doi"), // not sure whether we need to do this for all fields, it occurs e.g. for doi:10.1007/BF00391383 in the 'url' field
'actions' => array(
"/<!\[CDATA\[(.+?)\]\]>/" => "\\1" // remove any '<![CDATA[...]]>' wrapper
)
)
);
// -----------------------------------------
// PRE-PROCESS SOURCE TEXT:
// Split input text into individual records:
$recordArray = splitSourceText($sourceText, $recordDelimiter, false);
// PROCESS SOURCE DATA:
// Initialize array variables:
$parsedRecordsArray = array(); // initialize array variable which will hold parsed data of all records that shall be imported
// NOTE: We do NOT validate records yet, i.e. we assume that they are perfect and attempt to import all of them:
$importRecordNumbersRecognizedFormatArray = array(); // initialize array variable which will hold all record numbers of those records that shall be imported AND which were of a recognized format
$importRecordNumbersNotRecognizedFormatArray = array(); // same for all records that shall be imported BUT which had an UNrecognized format
$recordsCount = count($recordArray); // count how many records are available
// -----------------------------------------
// LOOP OVER EACH RECORD:
for ($i=0; $i<$recordsCount; $i++) // for each record...
{
$fieldParametersArray = array(); // setup an empty array (it will hold all fields that were extracted for a given record)
// Parse record XML:
$XML = new XML($recordArray[$i]);
// Check for any errors:
$errorXML = $XML->getBranches("doi_record/crossref","error");
if (!empty($errorXML[0]))
{
$importRecordNumbersNotRecognizedFormatArray[] = $i + 1; // append this record number to the list of numbers whose record format is NOT recognized
$crossRefError = $errorXML[0]->getTagContent("error"); // e.g. "DOI not found in CrossRef"
// Prepare an appropriate error message:
$errorMessage = "Record " . ($i + 1) . ": " . $crossRefError . "!";
if (!isset($errors["sourceText"]))
$errors["sourceText"] = $errorMessage;
else
$errors["sourceText"] = $errors["sourceText"] . "<br>" . $errorMessage;
}
else // a DOI record was found
{
// NOTE: We do NOT yet validate any found records, i.e. for now, we'll just assume that they are ok:
$importRecordNumbersRecognizedFormatArray[] = $i + 1; // append this record number to the list of numbers whose record format IS recognized ('$i' starts with 0 so we have to add 1 to point to the correct record number)
$fieldParametersArray['type'] = 'Journal Article'; // MOST CrossRef entitites are journal articles. TODO: find what isn't & fix the type
// Parse main XML branches:
$metadataXML = $XML->getBranches("doi_record/crossref/journal","journal_metadata");
$issueXML = $XML->getBranches("doi_record/crossref/journal","journal_issue");
$articleXML = $XML->getBranches("doi_record/crossref/journal","journal_article");
$contributorsXML = $XML->getBranches("doi_record/crossref/journal/journal_article/contributors","person_name");
// Process '$metadataXML':
// TODO:
// - Put CODEN and/or publisher-specific article IDs ('<publisher_item>') into the 'notes' field (?)
$fieldParametersArray['publication'] = $metadataXML[0]->getTagContent("journal_metadata/full_title");
$fieldParametersArray['abbrev_journal'] = $metadataXML[0]->getTagContent("journal_metadata/abbrev_title");
// Get print ISSN ('media_type="print"')
$issnXML = $XML->getBranches("doi_record/crossref/journal/journal_metadata","issn","media_type","print");
if (!empty($issnXML[0]))
$issn = $issnXML[0]->getTagContent("issn");
else // if there's no ISSN tag with attribute 'media_type="print"', we fall back to the first given ISSN tag (if any)
{
$issnXML = $XML->getBranches("doi_record/crossref/journal/journal_metadata","issn");
if (!empty($issnXML[0]))
$issn = $issnXML[0]->getTagContent("issn");
else
$issn = "";
}
if (!empty($issn))
$fieldParametersArray['issn'] = $issn;
// Process '$issueXML':
$fieldParametersArray['year'] = $issueXML[0]->getTagContent("journal_issue/publication_date/year");
$fieldParametersArray['volume'] = $issueXML[0]->getTagContent("journal_issue/journal_volume/volume");
$fieldParametersArray['issue'] = $issueXML[0]->getTagContent("journal_issue/issue");
// Proccess '$articleXML':
$fieldParametersArray['title'] = $articleXML[0]->getTagContent("journal_article/titles/title");
// - Append any subtitle to the main title:
$subTitleXML = $articleXML[0]->getBranches("journal_article/titles","subtitle");
if (!empty($subTitleXML[0]))
$fieldParametersArray['title'] .= ": " . $subTitleXML[0]->getTagContent("subtitle");
$fieldParametersArray['doi'] = $articleXML[0]->getTagContent("journal_article/doi_data/doi");
$fieldParametersArray['url'] = $articleXML[0]->getTagContent("journal_article/doi_data/resource");
$fieldParametersArray['startPage'] = $articleXML[0]->getTagContent("journal_article/pages/first_page");
$fieldParametersArray['endPage'] = $articleXML[0]->getTagContent("journal_article/pages/last_page");
// Process '$contributorsXML':
// TODO:
// - Differentiate authors from other types of contributors
$author = "";
foreach ($contributorsXML as $contributor)
{
$givenName = $contributor->getTagContent("person_name/given_name");
$familyName = $contributor->getTagContent("person_name/surname");
// If the author's family (or given) name is entirely in uppercase letters, we attempt to convert the string to something more readable:
if ($transformCase)
{
if (preg_match("/^[$upper\W\d]+$/$patternModifiers", $familyName))
// Convert upper case to title case (converts e.g. "STEFFENS" into "Steffens"):
$familyName = changeCase('title', $familyName); // function 'changeCase()' is defined in 'include.inc.php'
if (preg_match("/^[$upper\W\d]+$/$patternModifiers", $givenName))
// Convert upper case to title case (converts e.g. "MATTHIAS" into "Matthias"):
$givenName = changeCase('title', $givenName);
}
// Append any name suffix to the surname:
$nameSuffixXML = $contributor->getBranches("person_name","suffix");
if (!empty($nameSuffixXML[0]))
$familyName .= " " . $nameSuffixXML[0]->getTagContent("suffix");
$author .= $familyName . ", " . $givenName . "; ";
}
$author = trim($author, "; ");
$fieldParametersArray['author'] = $author;
// Standardize field data contained in '$fieldParametersArray':
foreach ($fieldParametersArray as $fieldKey => $fieldData)
{
// Decode HTML special chars:
if (($fieldKey != "url") AND preg_match('/&(amp|quot|#0?39|lt|gt);/', $fieldData))
$fieldParametersArray[$fieldKey] = decodeHTMLspecialchars($fieldData); // function 'decodeHTMLspecialchars()' is defined in 'include.inc.php'
elseif (($fieldKey == "url") AND preg_match('/&amp;/', $fieldData)) // in case of the 'url' field, we just decode any ampersand characters
$fieldParametersArray[$fieldKey] = str_replace('&amp;', '&', $fieldData);
}
// Function 'standardizeFieldData()' e.g. performs case transformation, standardizes thesis names, normalizes page ranges, and reformats person names according to preference:
$fieldParametersArray = standardizeFieldData($fieldParametersArray, "CrossRef XML", $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray);
// Append the array of extracted field data to the main data array which holds all records to import:
$parsedRecordsArray[] = $fieldParametersArray;
}
}
// -----------------------------------------
// Build refbase import array:
$importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
"1.0", // 'version' - the version of the given array structure
"http://refbase.net/import/crossref/", // 'creator' - the name of the script/importer (preferably given as unique URI)
"Richard Karnesky", // 'author' - author/contact name of the person who's responsible for this script/importer
"karnesky@users.sourceforge.net", // 'contact' - author's email/contact address
array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
$parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)
return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
}
// --------------------------------------------------------------------
// ARXIV TO REFBASE
// This function converts records from arXiv.org's Atom XML Opensearch format into the standard "refbase"
// array format which can be then imported by the 'addRecords()' function in 'include.inc.php'.
// Info on the arXiv API (including sample code) is given at:
//   <http://export.arxiv.org/api_help/>
//   <http://export.arxiv.org/api_help/docs/user-manual.html>
//   <http://export.arxiv.org/api_help/docs/examples/php_arXiv_parsing_example.txt>
//
// Requires the SimplePie library (by Ryan Parman and Geoffrey Sneddon), which is
// available under the BSD license from: <http://simplepie.org>
//
// Parameters:
//   - '$feed'                     - must contain the list of Atom feed items given as a SimplePie object (passed by reference)
//   - '$importRecordsRadio'       - not used by this function (the signature mirrors the other '...ToRefbase()' importers)
//   - '$importRecordNumbersArray' - not used by this function
//
// Returns an array with five elements:
//   0 - the import data array as built by function 'buildImportArray()'
//   1 - the total number of feed items
//   2 - array of record numbers (1-based) that were parsed as arXiv records
//   3 - array of record numbers (1-based) that were rejected (arXiv error entry or empty entry)
//   4 - the global '$errors' array (any problems are appended to '$errors["sourceText"]', separated by '<br>')
function arxivToRefbase(&$feed, $importRecordsRadio, $importRecordNumbersArray)
{
    global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'
    global $contentTypeCharset; // defined in 'ini.inc.php'
    global $errors;
    global $showSource;

    // Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data:
    // (Notes: - name standardization occurs after multiple author fields have been merged by '; '
    //         - the split pattern must be specified as perl-style regular expression (including the leading & trailing
    //           slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
    $personDelimiter = "/ *; */";

    // Pattern by which a person's family name is separated from the given name (or initials):
    // (the split pattern must be specified as perl-style regular expression (including the leading & trailing
    //  slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
    $familyNameGivenNameDelimiter = "/ (?=([$upper]+[-$alpha]+)( *;|$))/$patternModifiers";

    // Specifies whether the person's family name comes first within a person's name
    // ('true' means that the family name is followed by the given name (or initials), 'false' means that the person's family name comes *after* the given name (or initials))
    $familyNameFirst = false;

    // Specifies whether a person's full given name(s) shall be shortened to initial(s):
    // (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized (meaning removal of extra whitespace, adding of dots between initials, etc)
    //         - if set to 'false', given names (and any initials) are taken as is
    //         - in your database, you should stick to either fully written given names OR initials; if you mix these, records won't get sorted correctly on citation output)
    $shortenGivenNames = true;

    // Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'):
    $transformCase = true;

    // Postprocessor actions:
    // Defines search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element:
    // (If you don't want to perform any search and replace actions, specify an empty array, like: '$postprocessorActionsArray = array();'.
    //  Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
    //                                  "/Search Pattern/" => "Replace Pattern"
    $postprocessorActionsArray = array(
        array(
            'fields'  => array("title", "abstract", "notes"),
            'actions' => array(
                "/ *[\n\r]+ */" => " " // transform whitespace: replace any run of whitespace that includes newline/return character(s) with a space
            )
        ),
        array(
            'fields'  => array("title"),
            'actions' => array(
                "/[,.;:!] *$/" => "" // remove any punctuation (except for question marks) from end of field contents
            )
        )
    );

    // -----------------------------------------

    // PROCESS SOURCE DATA:

    // Initialize array variables:
    $parsedRecordsArray = array(); // will hold parsed data of all records that shall be imported

    // NOTE: We do NOT validate records yet, i.e. we assume that they are perfect and attempt to import all of them:
    $importRecordNumbersRecognizedFormatArray = array(); // record numbers of those records that were of a recognized format
    $importRecordNumbersNotRecognizedFormatArray = array(); // record numbers of those records that had an UNrecognized format

    // Use these namespaces to retrieve tags:
    $atomNamespace = 'http://www.w3.org/2005/Atom';
    $arxivNamespace = 'http://arxiv.org/schemas/atom';

    // Get feed data:
    $recordArray = $feed->get_items(); // fetch all feed items into an array
    $recordsCount = count($recordArray); // count how many records are available

    // -----------------------------------------

    // LOOP OVER EACH RECORD:

    for ($i = 0; $i < $recordsCount; $i++) // for each record...
    {
        $recordNumber = $i + 1; // '$i' starts with 0 so we have to add 1 to point to the correct record number
        $fieldParametersArray = array(); // will hold all fields that were extracted for this record
        $record = $recordArray[$i]; // this will make things a bit more readable

        // Check for any errors:
        $errorMessage = "";

        if ($record->get_title() == "Error")
            $errorMessage = "Record " . $recordNumber . ": " . $record->get_description() . "!"; // e.g. "incorrect id format for 1234.12345"
        elseif (!($record->get_permalink())) // empty record (which has no matching arXiv ID); ATM, this occurs e.g. when querying for "arXiv:1234.9999"
            $errorMessage = "Record " . $recordNumber . ": nothing found!";

        if ($errorMessage != "")
        {
            $importRecordNumbersNotRecognizedFormatArray[] = $recordNumber; // this record's format is NOT recognized

            if (!isset($errors["sourceText"]))
                $errors["sourceText"] = $errorMessage;
            else
                $errors["sourceText"] = $errors["sourceText"] . "<br>" . $errorMessage;

            continue; // proceed with the next record
        }

        // An arXiv record was found:
        // NOTE: We do NOT yet validate any found records, i.e. for now, we'll just assume that they are ok:
        $importRecordNumbersRecognizedFormatArray[] = $recordNumber; // this record's format IS recognized

        // Extract elements of the current Atom XML entry:

        // - type:
        $fieldParametersArray['type'] = 'Journal Article'; // NOTE: Are all arXiv records journal articles? TODO: find what isn't & fix the type

        // - id:
        //   extract the arXiv ID from the abstract URL in the 'id' element & prefix it with "arXiv:" (accepts 'http' as well as 'https' URLs)
        $fieldParametersArray['notes'] = preg_replace('#https?://arxiv\.org/abs/#', "arXiv:", $record->get_permalink());

        // - title:
        $fieldParametersArray['title'] = $record->get_title();

        // - summary:
        if ($abstract = $record->get_description())
            $fieldParametersArray['abstract'] = $abstract;

        // - author:
        //   NOTE: If we didn't want to extract author affiliation info, we could just use standard SimplePie functions ('get_authors()' and 'get_name()')
        $authorsArray = array();
        $addressArray = array();

        $authors = $record->get_item_tags($atomNamespace, 'author');

        if (is_array($authors)) // 'get_item_tags()' doesn't return an array if the entry has no such tag
        {
            foreach ($authors as $author)
            {
                // -- name:
                if (!isset($author['child'][$atomNamespace]['name'][0]['data']))
                    continue; // author element without a name

                $authorName = $author['child'][$atomNamespace]['name'][0]['data'];

                if (!$authorName)
                    continue; // empty name

                // In case of a latin1-based database, attempt to convert UTF-8 data to refbase markup & latin1:
                // NOTE: For authors, we need to perform charset conversion up here (and not further down below, as is done for all the other fields),
                //       since otherwise the below '$upper' and '$alpha' character class elements would fail to match!
                if (($contentTypeCharset == "ISO-8859-1") AND (detectCharacterEncoding($authorName) == "UTF-8")) // function 'detectCharacterEncoding()' is defined in 'include.inc.php'
                    $authorName = convertToCharacterEncoding("ISO-8859-1", "TRANSLIT", $authorName, "UTF-8"); // function 'convertToCharacterEncoding()' is defined in 'include.inc.php'

                // NOTE: Author names are kept in their original "Given Family" order; with the above settings for
                //       '$familyNameGivenNameDelimiter' and '$familyNameFirst', function 'standardizeFieldData()'
                //       takes care of reformatting person names.
                $authorsArray[] = $authorName;

                // -- arxiv:affiliation:
                if (isset($author['child'][$arxivNamespace]['affiliation']) AND ($authorAffiliations = $author['child'][$arxivNamespace]['affiliation']))
                {
                    // This must start out as an array: using '[]' on a string is a fatal error as of PHP 7.1
                    $authorAddressArray = array();

                    foreach ($authorAffiliations as $authorAffiliation)
                        $authorAddressArray[] = $authorAffiliation['data'];

                    $authorAddresses = implode(", ", $authorAddressArray);

                    // In case of a latin1-based database, attempt to convert UTF-8 data to refbase markup & latin1:
                    if (($contentTypeCharset == "ISO-8859-1") AND (detectCharacterEncoding($authorAddresses) == "UTF-8"))
                        $authorAddresses = convertToCharacterEncoding("ISO-8859-1", "TRANSLIT", $authorAddresses, "UTF-8");

                    // Extract the author's last name, i.e. the trailing name part that '$familyNameGivenNameDelimiter' treats
                    // as the family name (names are given as "Given Family"); if the pattern doesn't match, the full name is used:
                    $authorLastName = preg_replace("/^.+ ([$upper]+[-$alpha]+)$/$patternModifiers", "\\1", $authorName);

                    $addressArray[] = $authorLastName . ": " . $authorAddresses;
                }
            }
        }

        if (!empty($authorsArray))
            $fieldParametersArray['author'] = implode("; ", $authorsArray); // merge multiple authors

        if (!empty($addressArray))
            $fieldParametersArray['address'] = implode("; ", $addressArray); // merge multiple author affiliations

        // - links:
        //
        //   TODO: Currently, we just copy a link to the PDF to the 'file' field. It might be desirable to fetch the actual PDF and store it on the refbase server.
        //
        //   NOTE: - In order to extract any links, we access the raw SimplePie object here; This is done since, in SimplePie v1.1.1, the standard SimplePie functions
        //           'get_link()' and 'get_links()' only support checking for the 'rel' attribute, but don't allow to filter on the 'type' or 'title' attribute. However,
        //           we need to check the 'type' & 'title' attributes in order to assign PDF & DOI links to the 'file' & 'doi' fields, respectively. Alternatively, we
        //           could also get this information from the URL itself, but that may fail if arXiv changes its URL pattern.
        //         - More info on how to grab custom tags or attributes: <http://simplepie.org/wiki/tutorial/grab_custom_tags_or_attributes>
        $links = $record->get_item_tags($atomNamespace, 'link');

        if (is_array($links))
        {
            foreach ($links as $link)
            {
                if (isset($link['attribs']['']['href']))
                {
                    // -- file:
                    if (!isset($fieldParametersArray['file']) AND isset($link['attribs']['']['title']) AND ($link['attribs']['']['title'] == "pdf")) // we could also check for 'type="application/pdf"'
                        $fieldParametersArray['file'] = $link['attribs']['']['href'];

                    // -- url:
                    //    NOTE: the 'id' element (see above) also contains the URL to the abstract page for the current article
                    elseif (!isset($fieldParametersArray['url']) AND isset($link['attribs']['']['type']) AND ($link['attribs']['']['type'] == "text/html")) // we could also check for 'title' being unset
                        $fieldParametersArray['url'] = $link['attribs']['']['href'];

                    // -- doi:
                    //    NOTE: the 'arxiv:doi' element also contains the DOI for the current article
                    //    (the resolver prefix is stripped for 'http'/'https' and for both 'dx.doi.org' & 'doi.org')
                    elseif (!isset($fieldParametersArray['doi']) AND isset($link['attribs']['']['title']) AND ($link['attribs']['']['title'] == "doi"))
                        $fieldParametersArray['doi'] = preg_replace('#https?://(?:dx\.)?doi\.org/#', "", $link['attribs']['']['href']);
                }
            }
        }

        // - arxiv:comment:
        if ($comment = $record->get_item_tags($arxivNamespace, 'comment'))
            $fieldParametersArray['notes'] .= "; " . $comment[0]['data']; // TODO: if arXiv records can include multiple comments, we'd need to loop over all of them

        // - arxiv:primary_category:
        //   TODO: Should we copy the term given in the 'arxiv:primary_category' element to the 'area' field?

        // - arxiv:category:
        $categoriesArray = array();
        $categories = $record->get_categories();

        if (is_array($categories)) // 'get_categories()' doesn't return an array if the entry has no categories
        {
            foreach ($categories as $category)
                $categoriesArray[] = $category->get_label();
        }

        if (!empty($categoriesArray))
            $fieldParametersArray['keywords'] = implode("; ", $categoriesArray); // merge multiple categories

        // - arxiv:journal_ref:
        if ($journalRef = $record->get_item_tags($arxivNamespace, 'journal_ref'))
        {
            // We extract the full 'journal_ref' string into its own variable since we're going to mess with it:
            $journalRefData = preg_replace("/ *[\n\r]+ */", " ", $journalRef[0]['data']); // transform whitespace: replace any run of whitespace that includes newline/return character(s) with a space

            // NOTE: The formatting of the 'journal_ref' string can vary heavily, so
            //       the below parsing efforts may fail. Therefore, we'll also copy the
            //       original 'journal_ref' string to the 'notes' field, and display it
            //       in the header message when importing single records.
            $fieldParametersArray['source'] = $journalRefData;
            $fieldParametersArray['notes'] .= "; Journal Ref: " . $journalRefData;

            // Extract source info from the 'journal_ref' string into the different fields:
            // NOTE: We try to use reasonably liberal (and thus rather ugly!) regex patterns
            //       which should catch most of the commonly used formatting styles. However,
            //       as noted above, due to the varying formatting of the 'journal_ref' string,
            //       this may not be always entirely successful. Each step below removes the
            //       part it has recognized from '$journalRefData', so the order of steps matters.
            // TODO: Extract ISSN from the 'journal_ref' string (see e.g. 'arXiv:cond-mat/0506611v1')

            // -- journal:
            $journalName = preg_replace("/^(.+?)(?= *(\(?\d+|[,;]|(v(ol)?\.?|volume) *\d+|$)).*/i", "\\1", $journalRefData); // extract journal name
            $journalRefData = preg_replace("/^(.+?)(?= *(\(?\d+|[,;]|(v(ol)?\.?|volume) *\d+|$))[,; ]*/i", "", $journalRefData); // remove journal name from 'journal_ref' string

            // A journal name that contains dots is taken to be an abbreviated journal name:
            if (preg_match("/\./", $journalName))
                $fieldParametersArray['abbrev_journal'] = preg_replace("/(?<=\.)(?![ )]|$)/", " ", $journalName); // ensure that any dots are followed with a space
            else
                $fieldParametersArray['publication'] = $journalName;

            // -- volume:
            //    NOTE: The volume is assumed to be the first number that follows the journal name, and
            //          which is followed by another four-digit number (which is assumed to be the year).
            if (preg_match("/^(?:(?:v(?:ol)?\.?|volume) *)?(\w*\d+\w*)(?= *.*?\d{4})/i", $journalRefData))
            {
                $fieldParametersArray['volume'] = preg_replace("/^(?:(?:v(?:ol)?\.?|volume) *)?(\w*\d+\w*)(?= *.*?\d{4}).*/i", "\\1", $journalRefData); // extract volume
                $journalRefData = preg_replace("/^(?:(?:v(?:ol)?\.?|volume) *)?(\w*\d+\w*)(?= *.*?\d{4})[,; ]*/i", "", $journalRefData); // remove volume from 'journal_ref' string
            }

            // -- year (take 1):
            //    NOTE: For the first take, we assume the year to be the first occurrence of a four-digit number
            //          that's wrapped in parentheses.
            if (preg_match("/\(\d{4}\)/i", $journalRefData))
            {
                $fieldParametersArray['year'] = preg_replace("/^.*?\((\d{4})\).*?$/i", "\\1", $journalRefData); // extract year
                $journalRefData = preg_replace("/[,; ]*\(\d{4}\)[,; ]*/i", " ", $journalRefData); // remove year from 'journal_ref' string
            }

            // -- issue:
            //    NOTE: The issue is only recognized if it is preceded with a "n/no/number" prefix, or if it is a
            //          number with less than four digits that is enclosed in parentheses (we can check for the latter
            //          case since four-digit years that are wrapped in parens have already been removed). The regex
            //          patterns below also try to account for some non-digit characters in issue numbers.
            //    TODO: Support issue extraction from "Journal Vol:No ..." format (see e.g. 'arXiv:cond-mat/0703452v2')
            if (preg_match("/(?:(?:n\.|no\.?|number) *)(\w*[\d\/-]+\w*)|\((\w*(?:\d{1,3}|\d{1,2}[\/-]+\d{1,2})\w*)\)/i", $journalRefData)) // matches e.g. "no. 2", "Number 2" or "(1/2)"
            {
                $fieldParametersArray['issue'] = preg_replace("/^.*?(?:(?:(?:n\.|no\.?|number) *)(\w*[\d\/-]+\w*)|\((\w*(?:\d{1,3}|\d{1,2}[\/-]+\d{1,2})\w*)\)).*?$/i", "\\1\\2", $journalRefData); // extract issue
                $journalRefData = preg_replace("/[,; ]*(?:(?:(?:n\.|no\.?|number) *)(\w*[\d\/-]+\w*)|\((\w*(?:\d{1,3}|\d{1,2}[\/-]+\d{1,2})\w*)\))[,; ]*/i", "", $journalRefData); // remove issue from 'journal_ref' string
            }

            // -- pages (take 1):
            //    NOTE: For the first take, we assume the pages to be either preceded with a "p/pp" prefix, or to
            //          be a page range.
            if (preg_match("/(?:p(?:p)?\.? *)(\w*\d+\w*)(?: *-+ *(\w*\d+\w*))?|(?:p(?:p)?\.? *)?(\w*\d+\w*) *-+ *(\w*\d+\w*)/i", $journalRefData)) // matches e.g. "p. 167-188", "pp.361--364" or "197-209"
            {
                $fieldParametersArray['startPage'] = preg_replace("/^.*?(?:(?:p(?:p)?\.? *)(\w*\d+\w*)(?: *-+ *(\w*\d+\w*))?|(?:p(?:p)?\.? *)?(\w*\d+\w*) *-+ *(\w*\d+\w*)).*?$/i", "\\1\\3", $journalRefData); // extract starting page
                $fieldParametersArray['endPage'] = preg_replace("/^.*?(?:(?:p(?:p)?\.? *)(\w*\d+\w*)(?: *-+ *(\w*\d+\w*))?|(?:p(?:p)?\.? *)?(\w*\d+\w*) *-+ *(\w*\d+\w*)).*?$/i", "\\2\\4", $journalRefData); // extract ending page
                $journalRefData = preg_replace("/[,; ]*(?:(?:p(?:p)?\.? *)(\w*\d+\w*)(?: *-+ *(\w*\d+\w*))?|(?:p(?:p)?\.? *)?(\w*\d+\w*) *-+ *(\w*\d+\w*))[,; ]*/i", "", $journalRefData); // remove page info from 'journal_ref' string
            }

            // -- year (take 2):
            //    NOTE: For the second take, we assume the year to be the first occurrence of any four-digit number
            //          in the remaining 'journal_ref' string.
            if (!isset($fieldParametersArray['year']) AND preg_match("/\b\d{4}\b/i", $journalRefData))
            {
                $fieldParametersArray['year'] = preg_replace("/^.*?\b(\d{4})\b.*?$/i", "\\1", $journalRefData); // extract year
                $journalRefData = preg_replace("/[,; ]*\b\d{4}\b[,; ]*/i", " ", $journalRefData); // remove year from 'journal_ref' string
            }

            // -- pages (take 2):
            //    NOTE: For the second take, we assume the page info to be any number that is at the beginning of
            //          the remaining 'journal_ref' string.
            if (!isset($fieldParametersArray['startPage']) AND preg_match("/^[,; ]*\w*\d+\w*/i", $journalRefData))
            {
                $fieldParametersArray['startPage'] = preg_replace("/^[,; ]*(\w*\d+\w*).*?$/i", "\\1", $journalRefData); // extract page info
            }
        }

        // Standardize field data contained in '$fieldParametersArray':
        foreach ($fieldParametersArray as $fieldKey => $fieldData)
        {
            // In case of a latin1-based database, attempt to convert UTF-8 data to refbase markup & latin1:
            // (we exclude the 'author' and 'address' fields here since they have already been dealt with above)
            if ((!preg_match("/^(author|address)$/", $fieldKey)) AND ($contentTypeCharset == "ISO-8859-1") AND (detectCharacterEncoding($fieldData) == "UTF-8"))
            {
                $fieldData = convertToCharacterEncoding("ISO-8859-1", "TRANSLIT", $fieldData, "UTF-8");

                // Store the converted data right away; otherwise the conversion would get lost
                // for every field that doesn't also contain HTML entities (see below):
                $fieldParametersArray[$fieldKey] = $fieldData;
            }

            // Decode HTML special chars:
            if (($fieldKey != "url") AND preg_match('/&(amp|quot|#0?39|lt|gt);/', $fieldData))
                $fieldParametersArray[$fieldKey] = decodeHTMLspecialchars($fieldData); // function 'decodeHTMLspecialchars()' is defined in 'include.inc.php'

            elseif (($fieldKey == "url") AND preg_match('/&amp;/', $fieldData)) // in case of the 'url' field, we just decode any ampersand characters
                $fieldParametersArray[$fieldKey] = str_replace('&amp;', '&', $fieldData);
        }

        // Function 'standardizeFieldData()' e.g. performs case transformation, standardizes thesis names, normalizes page ranges, and reformats person names according to preference:
        $fieldParametersArray = standardizeFieldData($fieldParametersArray, "arXiv XML", $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray);

        // Append the array of extracted field data to the main data array which holds all records to import:
        $parsedRecordsArray[] = $fieldParametersArray;
    }

    // -----------------------------------------

    // Build refbase import array:
    $importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
                                        "1.0", // 'version' - the version of the given array structure
                                        "http://refbase.net/import/arxiv/", // 'creator' - the name of the script/importer (preferably given as unique URI)
                                        "Matthias Steffens", // 'author' - author/contact name of the person who's responsible for this script/importer
                                        "refbase@extracts.de", // 'contact' - author's email/contact address
                                        array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
                                        $parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)

    return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
}
// --------------------------------------------------------------------
// RIS TO REFBASE
// This function converts records from Reference Manager (RIS) format into the standard "refbase"
// array format which can then be imported by the 'addRecords()' function in 'include.inc.php'.
//
// Parameters:
//   $sourceText               - the raw RIS source data; it gets split into individual records on the "ER" (end of record) tag
//   $importRecordsRadio       - handed on unchanged to 'validateRecords()' (presumably selects whether all or only some records shall be imported -- TODO confirm against 'validateRecords()')
//   $importRecordNumbersArray - handed on unchanged to 'validateRecords()' (presumably the record numbers that were chosen for import -- TODO confirm against 'validateRecords()')
//
// Returns an array with these five elements (in this order):
//   0 - the import data array as generated by 'buildImportArray()'
//   1 - the records count as returned by 'parseRecords()'
//   2 - the array of record numbers with recognized format (from 'validateRecords()')
//   3 - the array of record numbers with unrecognized format (from 'validateRecords()')
//   4 - the global '$errors' variable as returned by 'validateRecords()'
function risToRefbase($sourceText, $importRecordsRadio, $importRecordNumbersArray)
{
	global $contentTypeCharset; // defined in 'ini.inc.php'
	global $errors;
	global $showSource; // NOTE(review): not used within this function

	// Define regular expression patterns that will facilitate parsing of RIS data:
	// (patterns must be specified as perl-style regular expression, without the leading & trailing slashes, if not stated otherwise)

	// Pattern by which the input text will be split into individual records:
	$recordDelimiter = "\s*[\r\n]ER - *[\r\n]*\s*";

	// Pattern by which records will be split into individual fields:
	$fieldDelimiter = "[\r\n]+(?=\w\w - )";

	// Pattern by which fields will be split into their field label (tag) and field data:
	$dataDelimiter = "(?<=^\w\w) - ";

	// Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data:
	// (Notes: - name standardization occurs after multiple author fields have been merged by '; '
	//         - the split pattern must be specified as perl-style regular expression (including the leading & trailing
	//           slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
	$personDelimiter = "/ *; */";

	// Pattern by which a person's family name is separated from the given name (or initials):
	// (the split pattern must be specified as perl-style regular expression (including the leading & trailing
	//  slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
	$familyNameGivenNameDelimiter = "/ *, */";

	// Specifies whether the person's family name comes first within a person's name
	// ('true' means that the family name is followed by the given name (or initials), 'false' means that the person's family name comes *after* the given name (or initials))
	$familyNameFirst = true;

	// Specifies whether a person's full given name(s) shall be shortened to initial(s):
	// (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized (meaning removal of extra whitespace, adding of dots between initials, etc)
	//         - if set to 'false', given names (and any initials) are taken as is
	//         - in your database, you should stick to either fully written given names OR initials; if you mix these, records won't get sorted correctly on citation output)
	$shortenGivenNames = true;

	// Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'):
	$transformCase = true;

	// Preprocessor actions:
	// Defines search & replace 'actions' that will be applied to each record's raw source data if the pattern in the corresponding 'match' element is matched:
	// (If you don't want to perform any preprocessor actions, specify an empty array, like: '$preprocessorActionsArray = array();'.
	//  Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
	// NOTE(review): the action below relies on the '/e' (eval) regex modifier which was deprecated in PHP 5.5 and removed in PHP 7.0;
	//               replacing it requires callback support in the code that applies these actions ('parseRecords()'), which is outside of this function.
	//                              "/Search Pattern/"  =>  "Replace Pattern"
	$preprocessorActionsArray = array(
		array(
			'match'   => "/&#?\w+;/", // if HTML encoded text (such as "&auml;", "&#xF6;" or "&#233;") occurs in the source data
			'actions' => array(
				"/(&#?\w+;)/e" => "html_entity_decode('\\1', ENT_QUOTES, '$contentTypeCharset')" // HTML decode source data (see <http://www.php.net/manual/en/function.html-entity-decode.php>)
			)
		)
	);

	// Postprocessor actions:
	// Defines search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element:
	// (If you don't want to perform any search and replace actions, specify an empty array, like: '$postprocessorActionsArray = array();'.
	//  Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
	//                              "/Search Pattern/"  =>  "Replace Pattern"
	$postprocessorActionsArray = array(
		array(
			'fields'  => array("year"),
			'actions' => array(
				"/^.*?(\d{4}).*/" => "\\1" // for the 'year' field, extract any four-digit number (and discard everything else)
			)
		),
		array(
			'fields'  => array("title"),
			'actions' => array(
				"/[,.;:!] *$/" => "" // remove any punctuation (except for question marks) from end of field contents
			)
		),
		array(
			'fields'  => array("notes"),
			'actions' => array(
				"/(; ?)?exported from refbase \(http[^ ]+ last updated.+?\d{2}:\d{2}:\d{2} [+-]\d{4}/" => "" // remove refbase attribution string (such as "exported from refbase (http://localhost/refs/show.php?record=12345), last updated on Sat, 15 Jul 2006 22:24:16 +0200")
			)
		),
		array(
			'fields'  => array("url"),
			'actions' => array(
				"/^PM:(\d+)$/i" => "http://www.ncbi.nlm.nih.gov/pubmed/\\1" // convert "PM:17302433" into a resolvable PubMed URL; Bibutils 'xml2ris' (<= v3.40) converts "<identifier type="pubmed">17302433</identifier>" to "UR - PM:17302433"
			)
		),
		array(
			'fields'  => array("doi"),
			'actions' => array(
				// NOTE: the registrant code that follows the "10." directory indicator consists of four OR MORE digits (e.g. "10.12345/..."),
				//       so we must not require exactly four digits here (otherwise valid DOIs with longer prefixes would get deleted by the first action)
				"#^(?:.*?)(?=10\.\d{4,}/\S+?)|^(?!10\.\d{4,}/\S+?).+$#" => "", // remove any text before the DOI (this also deletes the field's contents if it doesn't contain a DOI at all)
				"#(10\.\d{4,}/\S+?)(?=$|\s).*$#" => "\\1" // remove any text after the DOI
			)
		),
		array(
			'fields'  => array("title", "address", "keywords", "abstract", "orig_title", "series_title", "abbrev_series_title", "notes"), // convert font attributes (which some publishers include in RIS records that are available on their web pages)
			'actions' => array(
				"/<sup>(.+?)<\/sup>/i" => "[super:\\1]", // replace '<sup>...</sup>' with refbase markup ('[super:...]')
				"/<sub>(.+?)<\/sub>/i" => "[sub:\\1]", // replace '<sub>...</sub>' with refbase markup ('[sub:...]')
				"/<i>(.+?)<\/i>/i" => "_\\1_", // replace '<i>...</i>' with refbase markup ('_..._')
				"/<b>(.+?)<\/b>/i" => "**\\1**", // replace '<b>...</b>' with refbase markup ('**...**')
				"/\\x10(.+?)\\x11/" => "_\\1_" // replace '<ASCII#10>...<ASCII#11>' (which is used by Reference Manager to indicate italic strings) with refbase markup ('_..._')
			)
		),
		array(
			'fields'  => array("title", "abstract", "orig_title", "series_title", "abbrev_series_title", "notes"), // convert RefWorks font attributes (which RefWorks supports in title fields, notes, abstracts and user 1 - 5 fields)
			'actions' => array(
				"/0RW1S34RfeSDcfkexd09rT3(.+?)1RW1S34RfeSDcfkexd09rT3/" => "[super:\\1]", // replace RefWorks indicators for superscript text with refbase markup ('[super:...]')
				"/0RW1S34RfeSDcfkexd09rT4(.+?)1RW1S34RfeSDcfkexd09rT4/" => "[sub:\\1]", // replace RefWorks indicators for subscript text with refbase markup ('[sub:...]')
				"/0RW1S34RfeSDcfkexd09rT2(.+?)1RW1S34RfeSDcfkexd09rT2/" => "_\\1_", // replace RefWorks indicators for italic text with refbase markup ('_..._')
				"/0RW1S34RfeSDcfkexd09rT0(.+?)1RW1S34RfeSDcfkexd09rT0/" => "**\\1**", // replace RefWorks indicators for bold text with refbase markup ('**...**')
				"/0RW1S34RfeSDcfkexd09rT1(.+?)1RW1S34RfeSDcfkexd09rT1/" => "__\\1__" // replace RefWorks indicators for underline text with refbase markup ('__...__')
			)
		)
	);

	// This array lists patterns which match all RIS tags that must occur within a record to be recognized as valid RIS record:
	// (Array keys must contain the tag name as it should be displayed to the user; as is the case with search & replace actions,
	//  the search patterns MUST include the leading & trailing slashes.)
	//                              "tag display name"  =>  "tag search pattern"
	$requiredTagsArray = array(
		"TY" => "/^TY - /m"
	);

	// This array matches RIS tags with their corresponding refbase fields:
	// (fields that are unsupported in either RIS or refbase are commented out)
	// (where the value is itself an array, it maps RIS reference types to refbase fields, with "Other" naming the field for all remaining types)
	//                              "RIS tag" => "refbase field" // RIS tag name (comment)
	$tagsToRefbaseFieldsArray = array(
		"TY" => "type", // Type of reference (IMPORTANT: the array element that maps to 'type' must be listed as the first element!)
		"AU" => "author", // Author Primary
		"A1" => "author", // Author Primary
		"A2" => array("CONF" => "conference", "CPAPER" => "conference", "Other" => "editor"), // Author Secondary (see note for 'corporate_author' below)
		"ED" => "editor", // Author Secondary
		"A3" => "series_editor", // Author Series
//		"A4" => "", // Subsidiary Authors
		"AD" => "address", // Address
//		"" => "corporate_author", // note that bibutils uses the RIS 'A2' tag to indicate conference titles ('<name type="conference">') and corporate authors ('<name type="corporate">', e.g., when importing contents of the BibTeX 'organization' field)
//		"AN" => "", // Accession Number
//		"CA" => "", // Caption
//		"LB" => "", // Label
		"TI" => "title", // Title Primary
		"T1" => "title", // Title Primary
		"CT" => "title", // Title Primary
//		"" => "orig_title",
		"PY" => "year", // RIS Spec 2009: Publication Year (four digits); RIS Spec 2004: Date Primary (date must be in the following format: "YYYY/MM/DD/other_info"; the year, month and day fields are all numeric; the other info field can be any string of letters, spaces and hyphens; note that each specific date information is optional, however the slashes ("/") are not)
		"Y1" => "year", // Date Primary (same syntax rules as for "PY", RIS Spec 2004)
//		"Y2" => "", // Date Secondary (same syntax rules as for "PY", RIS Spec 2004)
		"DA" => "year", // RIS Spec 2009: Date (same syntax rules as for "PY", RIS Spec 2004)
		"BT" => array("BOOK" => "series_title", "STD" => "series_title", "THES" => "series_title", "Other" => "publication"), // according to <http://www.refman.com/support/risformat_tags_01.asp> this would be: array("BOOK" => "title", "Other" => "publication"), // Book Whole: Title Primary; Other reference types: Title Secondary
		"JF" => "publication", // Periodical name: full format
		"JO" => "publication", // Periodical name: full format
		"JA" => "abbrev_journal", // Periodical name: standard abbreviation
		"J1" => "abbrev_journal", // Periodical name: user abbreviation 1
		"J2" => "abbrev_journal", // Periodical name: user abbreviation 2
		"T2" => array("JOUR" => "abbrev_journal", "Other" => "abbrev_series_title"), // Title Secondary (note that "T2" is used by bibutils (instead of "JA") for abbreviated journal names and abbreviated series titles)
		"T3" => "series_title", // Title Series (in case of "TY=CONF", "T3" appears to be used for conference title)
		"VL" => "volume", // Volume number
		"CP" => "issue", // Issue
		"IS" => "issue", // Issue
		"SP" => "startPage", // Start page number (contents of the special fields 'startPage' and 'endPage' will be merged into a range and copied to the refbase 'pages' field)
		"EP" => "endPage", // Ending page number
		"LP" => "endPage", // Ending page number ('LP' is actually not part of the RIS specification but gets used in the wild such as in RIS exports of the American Physical Society, <http://aps.org/>)
//		"" => "series_volume", // (for 'series_volume' and 'series_issue', some magic will be applied within the 'parseRecords()' function)
//		"" => "series_issue",
		"PB" => "publisher", // Publisher
		"CY" => "place", // City of publication
//		"DB" => "", // Database
//		"DP" => "", // Database provider
		"ET" => "edition",
//		"" => "medium",
		"SN" => array("BOOK" => "isbn", "CHAP" => "isbn", "STD" => "isbn", "THES" => "isbn", "Other" => "issn"), // Book Whole, Book Chapter, Generic and Thesis: ISBN; Other reference types: ISSN (note that this will fail for a thesis that was published within a series with an ISSN number)
		"LA" => "language",
//		"" => "summary_language",
		"KW" => "keywords", // Keywords
		"AB" => "abstract", // Notes (NOTE: by mapping 'AB' to "abstract" we deviate from the RIS spec which treats 'AB' as synonym to 'N1', i.e. notes)
		"N2" => "abstract", // Abstract
//		"" => "area",
//		"" => "expedition",
//		"" => "conference",
		"DO" => "doi", // DOI (we also recognize a DOI when given in the M3 or UR fields; see below)
		"UR" => "url", // URL (URL addresses can be entered individually, one per tag or multiple addresses can be entered on one line using a semi-colon as a separator)
		"L1" => "file", // Link to PDF (same syntax rules as for "UR")
//		"L2" => "", // Link to Full-text (same syntax rules as for "UR")
//		"L3" => "related", // Related Records (this mapping would require some postprocessing of the field value so that it's suitable for the 'related' field)
//		"L4" => "", // Image(s)
		"N1" => "notes", // Notes
		"ID" => "call_number", // Reference ID (NOTE: if no other field gets mapped to the 'cite_key' field, the contents of the 'call_number' field will be also copied to the 'cite_key' field of the currently logged-in user)
//		"M1" => "", // Miscellaneous 1
//		"M2" => "", // Miscellaneous 2
		"M3" => "doi", // Miscellaneous 3 (ISI Web of Science exports the DOI number in the M3 field) but TR specified it should be Type of Work.
		"U1" => "thesis", // User definable 1 ('U1' is used by Bibutils to indicate the type of thesis, e.g. "Masters thesis" or "Ph.D. thesis"; function 'parseRecords()' will further tweak the contents of the refbase 'thesis' field)
		"U2" => "user_notes", // User definable 2
//		"U3" => "", // User definable 3
//		"U4" => "", // User definable 4
//		"U5" => "", // User definable 5
//		"" => "contribution_id",
//		"" => "online_publication",
//		"" => "online_citation",
//		"" => "approved",
//		"" => "orig_record",
//		"RP" => "copy", // Reprint status (valid values: "IN FILE", "NOT IN FILE", "ON REQUEST (MM/DD/YY)") (this mapping would require some postprocessing of the field value so that it's suitable for the 'copy' field)
//		"AV" => "", // Availability
//		"C1" => "", // Custom 1
//		"C2" => "", // Custom 2
//		"C3" => "", // Custom 3
//		"C4" => "", // Custom 4
//		"C5" => "", // Custom 5
//		"C6" => "", // Custom 6
//		"C7" => "", // Custom 7
//		"C8" => "", // Custom 8
	);

	// This array lists all RIS tags that may occur multiple times:
	$tagsMultipleArray = array(
		"AU",
		"A1",
		"A2",
		"ED",
		"A3",
		"KW",
		"UR", // currently, refbase only supports one URL per record (however, we allow 'UR' to occur multiple times to extract any DOI given as URL, otherwise only the first URL will be taken)
//		"L1", // currently, refbase only supports one file per record
		"N1"
	);

	// This array matches RIS reference types with their corresponding refbase types:
	// (RIS types that are currently not supported in refbase will be taken as is but will get
	//  prefixed with an "Unsupported: " label; '#fallback#' in comments indicates a type mapping that
	//  is not a perfect match but as close as currently possible)
	//                              "RIS type" => "refbase type" // name of RIS reference type (comment)
	$referenceTypesToRefbaseTypesArray = array(
		"ABST" => "Abstract", // Abstract
		"ADVS" => "Unsupported: Audiovisual Material", // Audiovisual material
		"AGGR" => "Unsupported: Aggregated Database", // Aggregated database
		"ANCIENT" => "Unsupported: Ancient Text", // Ancient text
		"ART" => "Unsupported: Art Work", // Art work
		"BILL" => "Unsupported: Bill/Resolution", // Bill/Resolution
		"BLOG" => "Unsupported: Blog", // Blog
		"BOOK" => "Book Whole", // Book, Whole
		"CASE" => "Unsupported: Case", // Case
		"CHAP(TER)?" => "Book Chapter", // Book chapter (the incorrect CHAPTER type gets used by SpringerLink, see e.g. RIS output at <http://www.springerlink.com/content/57w5dd51eh0h8a25>)
		"CHART" => "Unsupported: Chart", // Chart
		"CLSWK" => "Unsupported: Classical Work", // Classical work
		"COMP" => "Software", // Computer program
		"CONF" => "Conference Article", // Conference proceeding
		"CPAPER" => "Conference Article", // Conference paper
		"CTLG" => "Book Whole", // Catalog (#fallback#)
		"DATA" => "Unsupported: Data File", // Data file
		"DBASE" => "Unsupported: Online Database", // Online database
		"DICT" => "Book Whole", // Dictionary (#fallback#)
		"EBOOK" => "Book Whole", // Electronic book
		"ECHAP" => "Book Chapter", // Electronic book section
		"EDBOOK" => "Book Whole", // Edited book
		"EJOUR" => "Journal Article", // Electronic article
		"ELEC" => "Unsupported: Electronic Citation", // Electronic Citation
		"ENCYC" => "Book Whole", // Encyclopedia (#fallback#)
		"EQUA" => "Unsupported: Equation", // Equation
		"FIGURE" => "Unsupported: Figure", // Figure
		"GEN" => "Miscellaneous", // Generic
		"GOVDOC" => "Unsupported: Government Document", // Government document
		"GRNT" => "Unsupported: Grant", // Grant
		"HEAR" => "Unsupported: Hearing", // Hearing
		"ICOMM" => "Unsupported: Internet Communication", // Internet Communication
		"INPR" => "Journal Article", // In Press (#fallback#)
		"JFULL" => "Journal", // Journal (full)
		"JOUR" => "Journal Article", // Journal
		"LEGAL" => "Unsupported: Legal Rule", // Legal rule
		"MAP" => "Map", // Map
		"MANSCPT" => "Manuscript", // Manuscript
		"MGZN" => "Magazine Article", // Magazine article
		"MPCT" => "Unsupported: Motion Picture", // Motion picture
		"MULTI" => "Unsupported: Multimedia", // Multimedia
		"MUSIC" => "Unsupported: Music Score", // Music score
		"NEWS" => "Newspaper Article", // Newspaper
		"PAMP" => "Unsupported: Pamphlet", // Pamphlet
		"PAT" => "Patent", // Patent
		"PCOMM" => "Unsupported: Personal Communication", // Personal communication
		"RPRT" => "Report", // Report
		"SER" => "Unsupported: Serial (Book, Monograph)", // Serial (Book, Monograph)
		"SLIDE" => "Unsupported: Slide", // Slide
		"SOUND" => "Unsupported: Sound Recording", // Sound recording
		"STAND" => "Miscellaneous", // Standard (#fallback#) due to STD handling
		"STAT" => "Unsupported: Statute", // Statute
		"STD" => "Miscellaneous", // Generic (note that 'STD' is used by bibutils although it is NOT listed as a recognized reference type at <http://www.refman.com/support/risformat_reftypes.asp>)
		"THES" => "Thesis", // Thesis/Dissertation (function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field)
		"UNBILL" => "Unsupported: Unenacted Bill/Resolution", // Unenacted bill/resolution
		"UNPB" => "Manuscript", // Unpublished work (#fallback#)
		"VIDEO" => "Unsupported: Video Recording" // Video recording
	);

	// -----------------------------------------

	// Split input text into individual records:
	$recordArray = splitSourceText($sourceText, $recordDelimiter, false); // split on the "ER" (= end of record) tag that terminates every RIS record

	// Validate all records that shall be imported:
	list($errors, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray) = validateRecords($recordArray, $requiredTagsArray, $importRecordsRadio, $importRecordNumbersArray, $errors);

	// Parse all records that shall be imported:
	list($parsedRecordsArray, $recordsCount) = parseRecords($recordArray, "RIS", $importRecordNumbersRecognizedFormatArray, $tagsToRefbaseFieldsArray, $tagsMultipleArray, $referenceTypesToRefbaseTypesArray, $fieldDelimiter, $dataDelimiter, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray, $preprocessorActionsArray);

	// Build refbase import array:
	$importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
	                                    "1.0", // 'version' - the version of the given array structure
	                                    "http://refbase.net/import/ris/", // 'creator' - the name of the script/importer (preferably given as unique URI)
	                                    "Matthias Steffens", // 'author' - author/contact name of the person who's responsible for this script/importer
	                                    "refbase@extracts.de", // 'contact' - author's email/contact address
	                                    array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
	                                    $parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)

	return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
}
// --------------------------------------------------------------------
// ENDNOTE TO REFBASE
// This function converts records from Endnote tagged (Endnote Refer) format into the standard "refbase"
// array format which can then be imported by the 'addRecords()' function in 'include.inc.php'.
function endnoteToRefbase($sourceText, $importRecordsRadio, $importRecordNumbersArray)
{
global $contentTypeCharset; // defined in 'ini.inc.php'
global $errors;
global $showSource;
// Define regular expression patterns that will facilitate parsing of Endnote Refer data:
// (patterns must be specified as perl-style regular expression, without the leading & trailing slashes, if not stated otherwise)
// Pattern by which the input text will be split into individual records:
$recordDelimiter = "\s*(\r\n|\r(?!\n)|(?<!\r)\n){2,}\s*(?=%\S )";
// Pattern by which records will be split into individual fields:
$fieldDelimiter = "(\r\n|\r(?!\n)|(?<!\r)\n)+(?=%\S )";
// Pattern by which fields will be split into their field label (tag) and field data:
$dataDelimiter = "(?<=^%\S) ";
// Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data:
// (Notes: - name standardization occurs after multiple author fields have been merged by '; '
// - the split pattern must be specified as perl-style regular expression (including the leading & trailing
// slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
$personDelimiter = "/ *; */";
// Pattern by which a person's family name is separated from the given name (or initials):
// (the split pattern must be specified as perl-style regular expression (including the leading & trailing
// slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
$familyNameGivenNameDelimiter = "/ *, */";
// Specifies whether the person's family name comes first within a person's name
// ('true' means that the family name is followed by the given name (or initials), 'false' means that the person's family name comes *after* the given name (or initials))
$familyNameFirst = true;
// Specifies whether a person's full given name(s) shall be shortened to initial(s):
// (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized (meaning removal of extra whitespace, adding of dots between initials, etc)
// - if set to 'false', given names (and any initials) are taken as is
// - in your database, you should stick to either fully written given names OR initials; if you mix these, records won't get sorted correctly on citation output)
$shortenGivenNames = true;
// Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'):
$transformCase = true;
// Preprocessor actions:
// Defines search & replace 'actions' that will be applied to each record's raw source data if the pattern in the corresponding 'match' element is matched:
// (If you don't want to perform any preprocessor actions, specify an empty array, like: '$preprocessorActionsArray = array();'.
// Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
// "/Search Pattern/" => "Replace Pattern"
$preprocessorActionsArray = array(
array(
'match' => "/&#?\w+;/", // if HTML encoded text (such as "&auml;", "&#xF6;" or "&#233;") occurs in the source data
'actions' => array(
"/(&#?\w+;)/e" => "html_entity_decode('\\1', ENT_QUOTES, '$contentTypeCharset')" // HTML decode source data (see <http://www.php.net/manual/en/function.html-entity-decode.php>)
)
)
);
// Postprocessor actions:
// Defines search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element:
// (If you don't want to perform any search and replace actions, specify an empty array, like: '$postprocessorActionsArray = array();'.
// Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
// "/Search Pattern/" => "Replace Pattern"
$postprocessorActionsArray = array(
array(
'fields' => array("year"),
'actions' => array(
"/^.*?(\d{4}).*/" => "\\1" // for the 'year' field, extract any four-digit number (and discard everything else)
)
),
array(
'fields' => array("title"),
'actions' => array(
"/[,.;:!] *$/" => "" // remove any punctuation (except for question marks) from end of field contents
)
),
array(
'fields' => array("notes"),
'actions' => array(
"/(; ?)?exported from refbase \(http[^ ]+ last updated.+?\d{2}:\d{2}:\d{2} [+-]\d{4}/" => "" // remove refbase attribution string (such as "exported from refbase (http://localhost/refs/show.php?record=12345), last updated on Sat, 15 Jul 2006 22:24:16 +0200")
)
),
array(
'fields' => array("url"),
'actions' => array(
"/^PM:(\d+)$/i" => "http://www.ncbi.nlm.nih.gov/pubmed/\\1" // convert "PM:17302433" into a resolvable PubMed URL; Bibutils 'xml2ris' (<= v3.40) converts "<identifier type="pubmed">17302433</identifier>" to "UR - PM:17302433"
)
),
array(
'fields' => array("doi"),
'actions' => array(
"#^(?:.*?)(?=10\.\d{4}/\S+?)|^(?!10\.\d{4}/\S+?).+$#" => "", // remove any text before the DOI (this also deletes the field's contents if it doesn't contain a DOI at all)
"#(10\.\d{4}/\S+?)(?=$|\s).*$#" => "\\1" // remove any text after the DOI
)
),
array(
'fields' => array("title", "address", "keywords", "abstract", "orig_title", "series_title", "abbrev_series_title", "notes"), // convert font attributes (which publishers might have included in Endnote Refer records that are available on their web pages)
'actions' => array(
"/<sup>(.+?)<\/sup>/i" => "[super:\\1]", // replace '<sup>...</sup>' with refbase markup ('[super:...]')
"/<sub>(.+?)<\/sub>/i" => "[sub:\\1]", // replace '<sub>...</sub>' with refbase markup ('[sub:...]')
"/<i>(.+?)<\/i>/i" => "_\\1_", // replace '<i>...</i>' with refbase markup ('_..._')
"/<b>(.+?)<\/b>/i" => "**\\1**", // replace '<b>...</b>' with refbase markup ('**...**')
"/\\x10(.+?)\\x11/" => "_\\1_" // replace '<ASCII#10>...<ASCII#11>' (which is used by Reference Manager to indicate italic strings) with refbase markup ('_..._')
)
),
array(
'fields' => array("title", "abstract", "orig_title", "series_title", "abbrev_series_title", "notes"), // convert RefWorks font attributes (which RefWorks supports in title fields, notes, abstracts and user 1 - 5 fields)
'actions' => array(
"/0RW1S34RfeSDcfkexd09rT3(.+?)1RW1S34RfeSDcfkexd09rT3/" => "[super:\\1]", // replace RefWorks indicators for superscript text with refbase markup ('[super:...]')
"/0RW1S34RfeSDcfkexd09rT4(.+?)1RW1S34RfeSDcfkexd09rT4/" => "[sub:\\1]", // replace RefWorks indicators for subscript text with refbase markup ('[sub:...]')
"/0RW1S34RfeSDcfkexd09rT2(.+?)1RW1S34RfeSDcfkexd09rT2/" => "_\\1_", // replace RefWorks indicators for italic text with refbase markup ('_..._')
"/0RW1S34RfeSDcfkexd09rT0(.+?)1RW1S34RfeSDcfkexd09rT0/" => "**\\1**", // replace RefWorks indicators for bold text with refbase markup ('**...**')
"/0RW1S34RfeSDcfkexd09rT1(.+?)1RW1S34RfeSDcfkexd09rT1/" => "__\\1__" // replace RefWorks indicators for underline text with refbase markup ('__...__')
)
)
);
// This array lists patterns which match all Endnote Refer tags that must occur within a record to be recognized as valid Endnote Refer record:
// (Array keys must contain the tag name as it should be displayed to the user; as is the case with search & replace actions,
// the search patterns MUST include the leading & trailing slashes.)
// "tag display name" => "tag search pattern"
$requiredTagsArray = array(
"%0" => "/^%0 /m"
);
// This array matches Endnote Refer tags with their corresponding refbase fields:
// (fields that are unsupported in either Endnote Refer or refbase are commented out)
// "Endnote Refer tag" => "refbase field" // Endnote Refer tag name (comment)
$tagsToRefbaseFieldsArray = array(
"%0" => "type", // Reference Type (IMPORTANT: the array element that maps to 'type' must be listed as the first element!)
"%A" => "author", // Author (Primary Author)
"%E" => "editor", // Editor (Secondary Author)
"%Y" => "series_editor", // Tertiary Author (Series Author)
"%+" => "address", // Address
// "" => "corporate_author", // Corporate Author (in the original Refer format '%Q' was referring to the corporate author)
"%T" => "title", // Title (Primary Title)
// "" => "orig_title",
"%D" => "year", // Year (Primary Date; the year should be specified in full, and the month name rather than number should be used)
"%B" => array("Journal Article" => "publication", "Book Section" => "publication", "Conference Proceedings" => "publication", "Magazine Article" => "publication", "Newspaper Article" => "publication", "Other" => "series_title"), // Secondary Title (of a Book or Conference Name); refbase "in-container" types: Secondary Title; Other reference types: Tertiary Title (Series Title)
"%J" => "publication", // Secondary Title (Journal Name / Periodical Name)
// "" => "abbrev_journal", // Periodical Name: Standard Abbreviation (Endnote Refer doesn't seem to distinguish between full format & abbreviated formats)
"%S" => "series_title", // Tertiary Title (Series Title)
"%V" => "volume", // Volume
"%N" => "issue", // Number (Issue)
"%P" => "pages", // Pages
// "" => "series_volume", // (for 'series_volume' and 'series_issue', some magic will be applied within the 'parseRecords()' function)
// "" => "series_issue",
"%I" => "publisher", // Publisher
"%C" => "place", // Place Published
"%7" => "edition", // Edition
// "" => "medium",
"%@" => array("Book" => "isbn", "Book Section" => "isbn", "Edited Book" => "isbn", "Classical Work" => "isbn", "Generic" => "isbn", "Thesis" => "isbn", "Other" => "issn"), // ISBN/ISSN; Book Whole, Book Chapter, Generic and Thesis: ISBN; Other reference types: ISSN (note that this will fail for a thesis that was published within a series with an ISSN number)
"%G" => "language", // Language
// "" => "summary_language",
"%K" => "keywords", // Keywords
"%X" => "abstract", // Abstract
// "" => "area",
// "" => "expedition",
// "" => "conference", // see '%8' below
// "" => "doi", // DOI
"%U" => "url", // URL
"%>" => "file", // Link to PDF
// "" => "related", // Related Records (this mapping would require some postprocessing of the field value so that it's suitable for the 'related' field)
"%O" => "notes", // Notes (Bibutils uses '%O' for notes instead!)
"%Z" => "notes", // Notes
"%F" => "cite_key", // Label (Reference ID)
"%9" => "thesis", // Type of Work (how the entry was published; for reports, this would be the report type, and for theses, the thesis type (e.g. "Masters thesis" or "Ph.D. thesis"); function 'parseRecords()' will further tweak the contents of the refbase 'thesis' field; the original Refer format seems to use '%R' for report, paper, or thesis type)
// "%H" => "", // Translated Author (in the original Refer format '%H' was referring to the "Header commentary which is printed before the reference")
"%L" => "call_number", // Call Number (NOTE: if no other field gets mapped to the 'cite_key' field, the contents of the 'call_number' field will be also copied to the 'cite_key' field of the currently logged-in user)
"%8" => array("Conference Proceedings" => "conference", "Other" => "notes"), // Date (date associated with entry; for conference proceedings, this would be the date of the conference)
// "" => "contribution_id",
// "" => "online_publication",
// "" => "online_citation",
// "" => "approved",
// "" => "orig_record",
// "" => "copy", // Reprint status (this mapping would require some postprocessing of the field value so that it's suitable for the 'copy' field)
// "%M" => "", // Accession Number
// "%Q" => "", // Translated Title (in the original Refer format '%Q' was referring to the corporate author)
// "%R" => "", // Electronic Resource Number
// "%W" => "", // Database Provider
// "%1" => "", // Custom 1
// "%2" => "", // Custom 2
// "%3" => "", // Custom 3
// "%4" => "", // Custom 4
// "%6" => "", // Number of Volumes
// "%?" => "", // Subsidiary Author
// "%!" => "", // Short Title
// "%#" => "", // Custom 5
// "%$" => "", // Custom 6
// "%]" => "", // Custom 7
// "%&" => "", // Section
// "%(" => "", // Original Publication (date)
// "%)" => "", // Reprint Edition (date)
// "%*" => "", // Reviewed Item
// "%^" => "", // Caption
// "%<" => "", // Research Notes
// "%[" => "", // Access Date
// "%=" => "", // Last Modified Date
// "%~" => "", // Name of Database
);
// This array lists all Endnote Refer tags that may occur multiple times:
$tagsMultipleArray = array(
"%A",
"%E",
"%I",
"%K",
"%O",
"%Y",
"%Z",
"%@"
);
// This array matches Endnote Refer reference types with their corresponding refbase types:
// (Endnote Refer types that are currently not supported in refbase will be taken as is but will get
// prefixed with an "Unsupported: " label; '#fallback#' in comments indicates a type mapping that
// is not a perfect match but as close as currently possible)
// "Endnote Refer type" => "refbase type" // comment
$referenceTypesToRefbaseTypesArray = array(
// "" => "Abstract",
"Aggregated Database" => "Unsupported: Aggregated Database", // EN X2
"Ancient Text" => "Unsupported: Ancient Text", // EN X
"Artwork" => "Unsupported: Artwork",
"Audiovisual Material" => "Unsupported: Audiovisual Material",
"Bill" => "Unsupported: Bill",
"Blog" => "Unsupported: Blog", // EN X2
"^Book$" => "Book Whole", // without the Regex anchors "Book Section" would get renamed incorrectly as "Book Whole Section"
"Book Section" => "Book Chapter",
"Case" => "Unsupported: Case",
"Catalog" => "Unsupported: Catalog", // EN X2
"Chart or Table" => "Unsupported: Chart or Table",
"Classical Work" => "Book Whole", // #fallback# // EN 8 (Classical Works)
"Computer Program" => "Software",
"Conference Paper" => "Conference Article", // EN 8
"Conference Proceedings" => "Conference Volume",
"Dictionary" => "Unsupported: Dictionary", // EN X
"Edited Book" => "Book Whole",
"Electronic Article" => "Journal Article", // #fallback# // EN 8 (Electronic Journal); renamed in EN 9 (was: Electronic Journal)
"Electronic Book" => "Book Whole", // #fallback# // EN 8
"Encyclopedia" => "Unsupported: Encyclopedia", // EN X
"Equation" => "Unsupported: Equation",
"Figure" => "Unsupported: Figure",
"Film or Broadcast" => "Unsupported: Film or Broadcast",
"Generic" => "Miscellaneous",
"Government Document" => "Report", // #fallback# // EN 8 (Government Report or Document)
"Grant" => "Unsupported: Grant", // EN X
"Hearing" => "Unsupported: Hearing",
"Journal Article" => "Journal Article",
"Legal Rule or Regulation" => "Unsupported: Legal Rule or Regulation", // EN 8 (Legal Rule/Regulation)
"Magazine Article" => "Magazine Article",
"Manuscript" => "Manuscript",
"Map" => "Map",
"Newspaper Article" => "Newspaper Article",
"Online Database" => "Unsupported: Online Database", // EN 8
"Online Multimedia" => "Unsupported: Online Multimedia", // EN 8
"Pamphlet" => "Unsupported: Pamphlet", // EN X2
"Patent" => "Patent",
"Personal Communication" => "Unsupported: Personal Communication",
"Report" => "Report",
"Serial" => "Unsupported: Serial", // EN X2
"Standard" => "Unsupported: Standard", // EN X2
"Statute" => "Unsupported: Statute",
"Thesis" => "Thesis", // function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field
"Unpublished Work" => "Manuscript", // #fallback# // EN 8
"Web Page" => "Unsupported: Web Page", // renamed in EN X (was: Electronic Source)
"Unused 1" => "Unsupported: Unused 1",
"Unused 2" => "Unsupported: Unused 2",
"Unused 3" => "Unsupported: Unused 3"
);
// -----------------------------------------
// Split input text into individual records:
$recordArray = splitSourceText($sourceText, $recordDelimiter, false); // split on the blank line that delimites Endnote Refer records
// Validate all records that shall be imported:
list($errors, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray) = validateRecords($recordArray, $requiredTagsArray, $importRecordsRadio, $importRecordNumbersArray, $errors);
// Parse all records that shall be imported:
list($parsedRecordsArray, $recordsCount) = parseRecords($recordArray, "Endnote", $importRecordNumbersRecognizedFormatArray, $tagsToRefbaseFieldsArray, $tagsMultipleArray, $referenceTypesToRefbaseTypesArray, $fieldDelimiter, $dataDelimiter, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray, $preprocessorActionsArray);
// Build refbase import array:
$importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
"1.0", // 'version' - the version of the given array structure
"http://refbase.net/import/endnote-refer/", // 'creator' - the name of the script/importer (preferably given as unique URI)
"Matthias Steffens", // 'author' - author/contact name of the person who's responsible for this script/importer
"refbase@extracts.de", // 'contact' - author's email/contact address
array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
$parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)
return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
}
// --------------------------------------------------------------------
// MEDLINE TO REFBASE
// This function converts records from Pubmed MEDLINE format into the standard "refbase"
// array format which can then be imported by the 'addRecords()' function in 'include.inc.php'.
//
// Parameters:
//   $sourceText               - the raw MEDLINE source text (may contain multiple records)
//   $importRecordsRadio       - handed on unchanged to 'validateRecords()'
//   $importRecordNumbersArray - handed on unchanged to 'validateRecords()'
//
// Returns an array with these elements (in this order):
//   [0] the import data array as built by 'buildImportArray()'
//   [1] the records count as returned by 'parseRecords()'
//   [2] the record numbers whose format was recognized (from 'validateRecords()')
//   [3] the record numbers whose format was NOT recognized (from 'validateRecords()')
//   [4] the (global) '$errors' variable as updated by 'validateRecords()'
//
// NOTE: MEDLINE tags are left-justified and padded with spaces to a width of four characters,
//       followed by "- " (e.g. "PT  - ", "FAU - ", "PMID- "). All tag patterns below therefore
//       accept a variable number of spaces in front of the dash.
function medlineToRefbase($sourceText, $importRecordsRadio, $importRecordNumbersArray)
{
    global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'

    global $errors; // gets passed to (and re-assigned from) 'validateRecords()' below
    global $showSource; // (not referenced within this function)

    // Define regular expression patterns that will facilitate parsing of MEDLINE data:
    // (patterns must be specified as perl-style regular expression, without the leading & trailing slashes, if not stated otherwise)

    // Pattern by which the input text will be split into individual records:
    $recordDelimiter = "\s*[\r\n](?=PMID- |<html>)"; // PubMed error messages are wrapped into HTML (errors may occur e.g. when fetching MEDLINE data directly via their PubMed ID)

    // Pattern by which records will be split into individual fields:
    $fieldDelimiter = "[\r\n]+(?=\w{2,4} *- )";

    // Pattern by which fields will be split into their field label (tag) and field data:
    // (each lookbehind must be of fixed length, hence one alternative per tag length; the padding
    //  in front of the dash is matched with ' *' so that two-, three- and four-letter tags are
    //  recognized regardless of how many spaces were used to pad the tag)
    $dataDelimiter = "(?<=^\w{2}) *- |(?<=^\w{3}) *- |(?<=^\w{4}) *- ";

    // Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data:
    // (Notes: - name standardization occurs after multiple author fields have been merged by '; '
    //         - the split pattern must be specified as perl-style regular expression (including the leading & trailing
    //           slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
    $personDelimiter = "/ *; */";

    // Pattern by which a person's family name is separated from the given name (or initials):
    // (the split pattern must be specified as perl-style regular expression (including the leading & trailing
    //  slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
    $familyNameGivenNameDelimiter = "/ *, */";

    // Specifies whether the person's family name comes first within a person's name
    // ('true' means that the family name is followed by the given name (or initials), 'false' means that the person's family name comes *after* the given name (or initials))
    $familyNameFirst = true;

    // Specifies whether a person's full given name(s) shall be shortened to initial(s):
    // (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized (meaning removal of extra whitespace, adding of dots between initials, etc)
    //         - if set to 'false', given names (and any initials) are taken as is
    //         - in your database, you should stick to either fully written given names OR initials; if you mix these, records won't get sorted correctly on citation output)
    $shortenGivenNames = true;

    // Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'):
    $transformCase = true;

    // Preprocessor actions:
    // Defines search & replace 'actions' that will be applied to each record's raw source data if the pattern in the corresponding 'match' element is matched:
    // (If you don't want to perform any preprocessor actions, specify an empty array, like: '$preprocessorActionsArray = array();'.
    //  Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
    //     "/Search Pattern/" => "Replace Pattern"
    $preprocessorActionsArray = array(
        array(
            'match'   => "/^FAU *- .+?[\r\n]AU *- /m", // if author info is available via both 'FAU' *AND* 'AU' field(s)
            'actions' => array(
                "/^AU *- .+?[\r\n]+/m" => "" // discard any 'AU' field(s) (which otherwise would confuse the 'parseRecords()' function)
            )
        ),
        array(
            'match'   => "/^AU *- /m",
            'actions' => array(
                // change the string formatting in 'AU' field(s) to the one used by refbase (i.e. insert a comma between family name & initials);
                // the tag prefix is captured and re-inserted (instead of using a lookbehind) since a lookbehind cannot contain the variable-length padding
                "/^(AU *- )([$alpha -]+) +([$upper]+)/m$patternModifiers" => "\\1\\2, \\3"
            )
        )
    );

    // Postprocessor actions:
    // Defines search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element:
    // (If you don't want to perform any search and replace actions, specify an empty array, like: '$postprocessorActionsArray = array();'.
    //  Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
    //     "/Search Pattern/" => "Replace Pattern"
    $postprocessorActionsArray = array(
        array(
            'fields'  => array("year"),
            'actions' => array(
                "/^.*?(\d{4}).*/" => "\\1" // for the 'year' field, extract any four-digit number (and discard everything else)
            )
        ),
        array(
            'fields'  => array("title", "orig_title", "publication", "address"),
            'actions' => array(
                "/[,.;:!] *$/" => "" // remove any punctuation (except for question marks) from end of field contents
            )
        ),
        array(
            'fields'  => array("publication", "abbrev_journal"), // NOTE: this replacement action will probably be only beneficial for records of type "Journal Article" (if possible, this should rather be a preprocessor action to distinguish articles from books or other resource types)
            'actions' => array(
                // make sure that all journal title words (with >3 characters) start with an upper case letter (the 'e' modifier allows to execute PHP code within the replacement pattern)
                // NOTE(review): the 'e' (PREG_REPLACE_EVAL) modifier was deprecated in PHP 5.5 and removed in PHP 7.0; this action can only keep working
                //               on current PHP versions if the code that applies these actions supports callbacks ('preg_replace_callback()') -- TODO confirm & migrate
                "/\b([$lower])([$alpha]{3,})/e$patternModifiers" => "strtoupper('\\1').'\\2'"
            )
        ),
        array(
            'fields'  => array("issn"),
            'actions' => array(
                "/^.*?(\w{4}-?\w{4}).*/" => "\\1" // remove any text except the actual ISSN number
            )
        ),
        array(
            'fields'  => array("notes"),
            'actions' => array(
                "/^(\d+)/"   => "PMID:\\1", // insert a "PMID:" prefix in front of any number that's at the beginning of the notes field
                "/Pmc(\d+)/" => "PMCID:PMC\\1" // insert a "PMCID:" prefix in front of any number that is prefixed with "Pmc"
            )
        ),
        array(
            'fields'  => array("doi"),
            'actions' => array(
                "/^.*?([^ ]+) *\[doi\].*/" => "\\1", // if a DOI number is given, extract the DOI and discard everything else
                "/^.*?\[[^]]+\].*/"        => "" // if no DOI number was given but some other ID info is still present, remove everything from the field
            )
        ),
        array(
            'fields'  => array("language", "summary_language"),
            'actions' => array(
                "/^afr$/i" => "Afrikaans", // map abbreviated language names to full names (taken from <http://www.nlm.nih.gov/bsd/language_table.html>)
                "/^alb$/i" => "Albanian",
                "/^amh$/i" => "Amharic",
                "/^ara$/i" => "Arabic",
                "/^arm$/i" => "Armenian",
                "/^aze$/i" => "Azerbaijani",
                "/^ben$/i" => "Bengali",
                "/^bos$/i" => "Bosnian",
                "/^bul$/i" => "Bulgarian",
                "/^cat$/i" => "Catalan",
                "/^chi$/i" => "Chinese",
                "/^cze$/i" => "Czech",
                "/^dan$/i" => "Danish",
                "/^dut$/i" => "Dutch",
                "/^eng$/i" => "English",
                "/^epo$/i" => "Esperanto",
                "/^est$/i" => "Estonian",
                "/^fin$/i" => "Finnish",
                "/^fre$/i" => "French",
                "/^geo$/i" => "Georgian",
                "/^ger$/i" => "German",
                "/^gla$/i" => "Scottish Gaelic",
                "/^gre$/i" => "Greek, Modern",
                "/^heb$/i" => "Hebrew",
                "/^hin$/i" => "Hindi",
                "/^hun$/i" => "Hungarian",
                "/^ice$/i" => "Icelandic",
                "/^ind$/i" => "Indonesian",
                "/^ita$/i" => "Italian",
                "/^jpn$/i" => "Japanese",
                "/^kin$/i" => "Kinyarwanda",
                "/^kor$/i" => "Korean",
                "/^lat$/i" => "Latin",
                "/^lav$/i" => "Latvian",
                "/^lit$/i" => "Lithuanian",
                "/^mac$/i" => "Macedonian",
                "/^mal$/i" => "Malayalam",
                "/^mao$/i" => "Maori",
                "/^may$/i" => "Malay",
                "/^mul$/i" => "Multiple languages",
                "/^nor$/i" => "Norwegian",
                "/^per$/i" => "Persian",
                "/^pol$/i" => "Polish",
                "/^por$/i" => "Portuguese",
                "/^pus$/i" => "Pushto",
                "/^rum$/i" => "Romanian, Rumanian",
                "/^rus$/i" => "Russian",
                "/^san$/i" => "Sanskrit",
                "/^scc$/i" => "Serbian",
                "/^scr$/i" => "Croatian",
                "/^slo$/i" => "Slovak",
                "/^slv$/i" => "Slovenian",
                "/^spa$/i" => "Spanish",
                "/^swe$/i" => "Swedish",
                "/^tha$/i" => "Thai",
                "/^tur$/i" => "Turkish",
                "/^ukr$/i" => "Ukrainian",
                "/^und$/i" => "Undetermined",
                "/^urd$/i" => "Urdu",
                "/^vie$/i" => "Vietnamese",
                "/^wel$/i" => "Welsh"
            )
        )
    );

    // This array lists patterns which match all MEDLINE tags that must occur within a record to be recognized as valid MEDLINE record:
    // (Array keys must contain the tag name as it should be displayed to the user; as is the case with search & replace actions,
    //  the search patterns MUST include the leading & trailing slashes.)
    //     "tag display name" => "tag search pattern"
    $requiredTagsArray = array(
        "PMID" => "/^PMID- /m",
        "PT"   => "/^PT *- /m" // two-letter tags are padded with spaces in front of the dash
    );

    // This array matches MEDLINE tags with their corresponding refbase fields:
    // (MEDLINE fields taken from <http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helppubmed.table.pubmedhelp.T43>;
    //  fields that are unsupported in either MEDLINE or refbase are commented out)
    //     "MEDLINE tag" => "refbase field" // MEDLINE tag name [Description] (comment)
    $tagsToRefbaseFieldsArray = array(
        "PT" => "type", // Publication Type [The type of material the article represents] (IMPORTANT: the array element that maps to 'type' must be listed as the first element!)
        "AU" => "author", // Author [Authors] (the contents of the 'AU' field will be used if the 'FAU' field is not available; note that for records that contain both 'AU' *AND* 'FAU' fields, this only works if a suitable preprocessor action is defined, see above)
        "FAU" => "author", // Full Author Name [Full Author Names] (by default, we use this author format since family name and initials are uniquely separated by a comma)
        // "" => "editor",
        // "" => "series_editor",
        "AD" => "address", // Affiliation [Institutional affiliation and address of the first author]
        "CN" => "corporate_author", // Corporate Author [Corporate author or group names with authorship responsibility]
        "TI" => "title", // Title [The title of the article]
        "TT" => "orig_title", // Transliterated Title [Title of the article originally published in a non-English language, in that language]
        "DP" => "year", // Publication Date [The date the article was published]
        // "DEP" => "", // Date of Electronic Publication [Electronic publication date]
        "JT" => "publication", // Full Journal Title [Full journal title from NLM's cataloging data]
        "TA" => "abbrev_journal", // Journal Title Abbreviation [Standard journal title abbreviation]
        // "" => "series_title",
        "VI" => "volume", // Volume [Volume number of the journal]
        "IP" => "issue", // Issue [The number of the issue, part, or supplement of the journal in which the article was published]
        "PG" => "pages", // Pagination [The full pagination of the article]
        // "" => "series_volume",
        // "" => "series_issue",
        // "" => "edition",
        // "" => "medium",
        // "" => "isbn",
        "IS" => "issn", // ISSN [International Standard Serial Number of the journal]
        // "" => "publisher",
        // "PL" => "place", // Place of Publication [Journal's country of publication] (the "PL" field lists the *country* of publication but the *city* of publication should go into the "place" field)
        "LA" => "language", // Language [The language in which the article was published]
        // "" => "summary_language",
        "MH" => "keywords", // MeSH Terms [NLM's Medical Subject Headings (MeSH) controlled vocabulary]
        "OT" => "keywords", // Other Term [Non-MeSH subject terms (keywords) assigned by an organization identified by the Other Term Owner]
        "OAB" => "abstract", // Other Abstract [Abstract supplied by an NLM collaborating organization] (since "AB" is defined later and neither "OAB" nor "AB" is listed in '$tagsMultipleArray', any content in "AB" will overwrite contents of "OAB")
        "AB" => "abstract", // Abstract [English language abstract taken directly from the published article]
        // "" => "area",
        // "" => "expedition",
        // "" => "conference",
        "AID" => "doi", // Article Identifier [Article ID values supplied by the publisher may include the pii (controlled publisher identifier) or doi (Digital Object Identifier)] (using a search & replace action, we'll extract only the doi bit)
        // "" => "url",
        // "" => "file",
        "GN" => "notes", // General Note [Supplemental or descriptive information related to the document]
        "PMID" => "notes", // PubMed Unique Identifier [Unique number assigned to each PubMed citation]
        "PMC" => "notes", // PubMed Central ID
        // "" => "call_number",
        // "" => "contribution_id",
        // "" => "online_publication",
        // "" => "online_citation",
        // "" => "approved",
        // "" => "orig_record",
        //# "PUBM" => "online_publication", // Publishing Model [Article's model of print or electronic publishing]
        "SO" => "source", // Source [Composite field containing bibliographic information] (the contents of this special field may be presented within the header message of 'record.php' for easy comparison with the extracted data)
        // "CI" => "", // Copyright Information [Copyright statement provided by the publisher]
        // "CIN" => "", // Comment In [Reference containing a comment about the article]
        // "CON" => "", // Comment On [Reference upon which the article comments]
        // "CRF" => "", // Corrected and republished from [Final, correct version of an article]
        // "CRI" => "", // Corrected and republished in [Original article that was republished in corrected form]
        // "DA" => "", // Date Created [Used for internal processing at NLM]
        // "DCOM" => "", // Date Completed [Used for internal processing at NLM]
        // "EDAT" => "", // Entrez Date [The date the citation was added to PubMed]
        // "EFR" => "", // Erratum For [Cites the original article needing the correction]
        // "EIN" => "", // Erratum In [Reference containing a published erratum to the article]
        // "FIR" => "", // Full Investigator [Full investigator name]
        // "FPS" => "", // Full Personal Name as Subject [Full Personal Name of the subject of the article]
        // "GR" => "", // Grant Number [Research grant numbers, contract numbers, or both that designate financial support by any agency of the US PHS or Wellcome Trust]
        // "GS" => "", // Gene Symbol [Abbreviated gene names (used 1991 through 1996)]
        // "IR" => "", // Investigator [NASA-funded principal investigator]
        // "IRAD" => "", // Investigator Affiliation [Affiliation of NASA-funded principal investigator]
        // "JID" => "", // NLM Unique ID [Unique journal ID in NLM's catalog of books, journals, and audiovisuals]
        // "LR" => "", // Last Revision Date [The date a change was made to the record]
        // "MHDA" => "", // MeSH Date [The date MeSH terms were added to the citation. The MeSH date is the same as the Entrez date until MeSH are added]
        // "OCI" => "", // Other Copyright Information [Copyright owner]
        // "OID" => "", // Other ID [Identification numbers provided by organizations supplying citation data]
        // "ORI" => "", // Original Report In [Cites the original article associated with the patient summary]
        // "OTO" => "", // Other Term Owner [Organization that provided the Other Term data]
        // "OWN" => "", // Owner [Organization acronym that supplied citation data]
        // "PHST" => "", // Publication History Status Date [Publisher supplied dates regarding the article publishing process]
        // "PS" => "", // Personal Name as Subject [Individual is the subject of the article]
        // "PST" => "", // Publication Status [Publication status]
        // "RF" => "", // Number of References [Number of bibliographic references for Review articles]
        // "RIN" => "", // Retraction In [Retraction of the article]
        // "RN" => "", // EC/RN Number [Number assigned by the Enzyme Commission to designate a particular enzyme or by the Chemical Abstracts Service for Registry Numbers]
        // "ROF" => "", // Retraction Of [Article being retracted]
        // "RPF" => "", // Republished From [Original article]
        // "RPI" => "", // Republished In [Corrected and republished article]
        // "SB" => "", // Subset [Journal or citation subset values representing specialized topics]
        // "SFM" => "", // Space Flight Mission [NASA-supplied data space flight/mission name and/or number]
        // "SI" => "", // Secondary Source Identifier [Identifies secondary source databanks and accession numbers of molecular sequences discussed in articles]
        // "SPIN" => "", // Summary For Patients In [Cites a patient summary article]
        // "STAT" => "", // Status Tag [Used for internal processing at NLM]
        // "UIN" => "", // Update In [Update to the article]
        // "UOF" => "", // Update Of [The article being updated]
    );

    // This array lists all MEDLINE tags that may occur multiple times:
    $tagsMultipleArray = array(
        "AU", // see above note for 'AU' at '$tagsToRefbaseFieldsArray'
        "FAU",
        "MH",
        "OT",
        "AID",
        "PMID", // by allowing "PMID", "PMC" and "GN" to occur multiple times we can merge the contents of these fields into the 'notes' field
        "PMC",
        "GN"
    );

    // This array matches MEDLINE reference types with their corresponding refbase types:
    // (MEDLINE types that are currently not supported in refbase will be taken as is but will get
    //  prefixed with an "Unsupported: " label; '#fallback#' in comments indicates a type mapping that
    //  is not a perfect match but as close as currently possible)
    //     "MEDLINE type" => "refbase type"
    $referenceTypesToRefbaseTypesArray = array(
        // "Journal Article" => "Journal Article", // NOTE: PubMed has *many* more types which should be dealt with (see e.g. <http://www.nlm.nih.gov/mesh/pubtypes2006.html> and <http://www.nlm.nih.gov/mesh/pubtypesg2003.html>)
        "JOURNAL ARTICLE" => "Journal Article",
        "REVIEW|Review" => "Journal Article", // in some records, "PT" may occur multiple times (e.g. as in "PT - Journal Article\nPT - Review"), and refbase currently uses the contents of the last "PT" as type
        "Monograph|Account Books|Guidebooks|Handbooks|Textbooks" => "Book Whole",
        "Congresses|Meeting Abstracts" => "Conference Article",
        "Consensus Development Conference(, NIH)?" => "Conference Article",
        "Newspaper Article" => "Newspaper Article",
        "(Annual|Case|Technical) Reports?" => "Report",
        "Manuscripts|Unpublished Works" => "Manuscript",
        "Patents" => "Patent",
        "Maps" => "Map",
        "Editorial" => "Journal Article",
        "Letter" => "Journal Article", // #fallback#
        "Validation Studies" => "Journal Article",
        "Research Support, N\.I\.H\., (Ex|In)tramural *" => "Journal Article",
        "Research Support, (Non-)?U\.S\. Gov\'t(, (Non-)?P\.H\.S\.)? *" => "Journal Article"
    );

    // -----------------------------------------

    // Split input text into individual records:
    $recordArray = splitSourceText($sourceText, $recordDelimiter, false); // split at the line break in front of every "PMID- " tag (or in front of an "<html>" block, see '$recordDelimiter' above)

    // Validate all records that shall be imported:
    list($errors, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray) = validateRecords($recordArray, $requiredTagsArray, $importRecordsRadio, $importRecordNumbersArray, $errors);

    // Parse all records that shall be imported:
    list($parsedRecordsArray, $recordsCount) = parseRecords($recordArray, "MEDLINE", $importRecordNumbersRecognizedFormatArray, $tagsToRefbaseFieldsArray, $tagsMultipleArray, $referenceTypesToRefbaseTypesArray, $fieldDelimiter, $dataDelimiter, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray, $preprocessorActionsArray);

    // Build refbase import array:
    $importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
                                        "1.0", // 'version' - the version of the given array structure
                                        "http://refbase.net/import/medline/", // 'creator' - the name of the script/importer (preferably given as unique URI)
                                        "Matthias Steffens", // 'author' - author/contact name of the person who's responsible for this script/importer
                                        "refbase@extracts.de", // 'contact' - author's email/contact address
                                        array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
                                        $parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)

    return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
}
// --------------------------------------------------------------------
// REFWORKS TO REFBASE
// This function converts records from RefWorks Tagged Format into the standard "refbase"
// array format which can be then imported by the 'addRecords()' function in 'include.inc.php'.
// More info on the RefWorks Tagged Format: <http://refworks.scholarsportal.info/Refworks/help/RefWorks_Tagged_Format.htm>
function refworksToRefbase($sourceText, $importRecordsRadio, $importRecordNumbersArray)
{
global $errors;
global $showSource;
// Define regular expression patterns that will facilitate parsing of RefWorks data:
// (patterns must be specified as perl-style regular expression, without the leading & trailing slashes, if not stated otherwise)
// Pattern by which the input text will be split into individual records:
$recordDelimiter = "\s*[\r\n][\r\n][\r\n]+\s*";
// Pattern by which records will be split into individual fields:
$fieldDelimiter = "[\r\n]+(?=\w\w )";
// Pattern by which fields will be split into their field label (tag) and field data:
$dataDelimiter = "(?<=^\w\w) ";
// Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data:
// (Notes: - name standardization occurs after multiple author fields have been merged by '; '
// - the split pattern must be specified as perl-style regular expression (including the leading & trailing
// slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
$personDelimiter = "/ *; */";
// Pattern by which a person's family name is separated from the given name (or initials):
// (the split pattern must be specified as perl-style regular expression (including the leading & trailing
// slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
$familyNameGivenNameDelimiter = "/ *, */";
// Specifies whether the person's family name comes first within a person's name
// ('true' means that the family name is followed by the given name (or initials), 'false' means that the person's family name comes *after* the given name (or initials))
$familyNameFirst = true;
// Specifies whether a person's full given name(s) shall be shortened to initial(s):
// (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized (meaning removal of extra whitespace, adding of dots between initials, etc)
// - if set to 'false', given names (and any initials) are taken as is
// - in your database, you should stick to either fully written given names OR initials; if you mix these, records won't get sorted correctly on citation output)
$shortenGivenNames = true;
// Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'):
$transformCase = true;
// Preprocessor actions:
// Defines search & replace 'actions' that will be applied to each record's raw source data if the pattern in the corresponding 'match' element is matched:
// (If you don't want to perform any preprocessor actions, specify an empty array, like: '$preprocessorActionsArray = array();'.
// Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
// "/Search Pattern/" => "Replace Pattern"
$preprocessorActionsArray = array();
// Postprocessor actions:
// Defines search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element:
// (If you don't want to perform any search and replace actions, specify an empty array, like: '$postprocessorActionsArray = array();'.
// Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
// "/Search Pattern/" => "Replace Pattern"
$postprocessorActionsArray = array(
array(
'fields' => array("year"),
'actions' => array(
"/^.*?(\d{4}).*/" => "\\1" // for the 'year' field, extract any four-digit number (and discard everything else)
)
),
array(
'fields' => array("title"),
'actions' => array(
"/[,.;:!] *$/" => "" // remove any punctuation (except for question marks) from end of field contents
)
),
array(
'fields' => array("title", "abstract", "orig_title", "series_title", "abbrev_series_title", "notes"), // convert RefWorks font attributes (which RefWorks supports in title fields, notes, abstracts and user 1 - 5 fields)
'actions' => array(
"/0RW1S34RfeSDcfkexd09rT3(.+?)1RW1S34RfeSDcfkexd09rT3/" => "[super:\\1]", // replace RefWorks indicators for superscript text with refbase markup ('[super:...]')
"/0RW1S34RfeSDcfkexd09rT4(.+?)1RW1S34RfeSDcfkexd09rT4/" => "[sub:\\1]", // replace RefWorks indicators for subscript text with refbase markup ('[sub:...]')
"/0RW1S34RfeSDcfkexd09rT2(.+?)1RW1S34RfeSDcfkexd09rT2/" => "_\\1_", // replace RefWorks indicators for italic text with refbase markup ('_..._')
"/0RW1S34RfeSDcfkexd09rT0(.+?)1RW1S34RfeSDcfkexd09rT0/" => "**\\1**", // replace RefWorks indicators for bold text with refbase markup ('**...**')
"/0RW1S34RfeSDcfkexd09rT1(.+?)1RW1S34RfeSDcfkexd09rT1/" => "__\\1__" // replace RefWorks indicators for underline text with refbase markup ('__...__')
)
)
);
// This array lists patterns which match all RefWorks tags that must occur within a record to be recognized as valid RefWorks record:
// (Array keys must contain the tag name as it should be displayed to the user; as is the case with search & replace actions,
// the search patterns MUST include the leading & trailing slashes.)
// "tag display name" => "tag search pattern"
$requiredTagsArray = array(
"RT" => "/^RT /m"
);
// This array matches RefWorks tags with their corresponding refbase fields:
// (fields that are unsupported in either RefWorks or refbase are commented out)
// "RefWorks tag" => "refbase field" // RefWorks tag name (comment)
$tagsToRefbaseFieldsArray = array(
"RT" => "type", // Reference Type (IMPORTANT: the array element that maps to 'type' must be listed as the first element!)
// "" => "thesis",
"A1" => "author", // Primary Authors
"A2" => "editor", // Secondary Authors (Editors)
"A3" => "series_editor", // Tertiary Authors (Series Editors)
// "A4" => "", // Quaternary Authors (Translators)
// "A5" => "", // Quinary Authors (Compliers)
// "A6" => "", // Website Editors
"AD" => "address", // Author Address
// "" => "corporate_author",
"T1" => "title", // Primary Title
"OT" => "orig_title", // Original Foreign Title
// "ST" => "", // Shortened Title
// "WT" => "", // Website Title
// "FD" => "", // Publication Data, Free Form (this field is used for date information such as a season or month and day; year data is solely placed in the year field, i.e., "YR 2003")
"YR" => "year", // Year
// "RD" => "", // Retrieved Date
// "WV" => "", // Website Version
// "WP" => "", // Date of Electronic Publication
"JF" => "publication", // Periodical name: full format
"JO" => "abbrev_journal", // Periodical name: standard abbreviation
"T2" => array("Book, Section" => "publication", "Other" => "series_title"), // Secondary Title
"T3" => "abbrev_series_title", // Tertiary Title
"VO" => "volume", // Volume
// "NV" => "", // Number of volumes
"IS" => "issue", // Issue
"SP" => "startPage", // Start Page (contents of the special fields 'startPage' and 'endPage' will be merged into a range and copied to the refbase 'pages' field)
"OP" => "endPage", // Either [1] "endPage", Other Pages ('SP' is the tag for the starting page and should only contain this information; the 'OP' tag is used for any additional pages or page information) or [2] Original publication
// "" => "series_volume", // (for 'series_volume' and 'series_issue', some magic will be applied within the 'parseRecords()' function)
// "" => "series_issue",
"PB" => "publisher", // Publisher
"PP" => "place", // Place of Publication
"ED" => "edition", // Edition
// "" => "medium",
"SN" => array("Book, Section" => "isbn", "Book, Edited" => "isbn", "Book, Whole" => "isbn", "Dissertation" => "isbn", "Dissertation/Thesis" => "isbn", "Other" => "issn"), // Book Whole & Book Chapter: ISBN; Other reference types: ISSN
"LA" => "language", // Language
// "" => "summary_language",
"K1" => "keywords", // Keywords
"AB" => "abstract", // Abstract
// "" => "area",
// "" => "expedition",
// "" => "conference",
"DO" => "doi", // Digital Object Identifier
"LK" => "url", // Links
"UL" => "url", // URL
// "" => "file", // Link to PDF
// "" => "related", // Related Records
"NO" => "notes", // Notes
"ID" => "call_number", // Reference Identifier (NOTE: if no other field gets mapped to the 'cite_key' field, the contents of the 'call_number' field will be also copied to the 'cite_key' field of the currently logged-in user)
"CN" => "notes", // Call Number (if 'ID' would be mapped to 'cite_key', contents of this field could go into the 'call_number' field)
"IP" => "notes", // Identifying Phrase (NOTE: should we rather put the contents of this field into the 'cite_key' field?)
// "U1" => "", // User definable 1
// "U2" => "", // User definable 2
// "U3" => "", // User definable 3
// "U4" => "", // User definable 4
// "U5" => "", // User definable 5
// "" => "contribution_id",
// "" => "online_publication",
// "" => "online_citation",
// "" => "approved",
// "" => "orig_record",
// "" => "copy", // Reprint status
"AV" => "notes", // Availability
// "AN" => "", // Accession Number
// "CA" => "", // Caption
// "CL" => "", // Classification
// "SF" => "", // Subfile/Database
// "DB" => "", // Database
// "DS" => "", // Data Source
// "SL" => "", // Sponsoring Library
// "LL" => "", // Sponsoring Library Location
// "CR" => "", // Cited References
);
// This array lists all RefWorks tags that may occur multiple times:
$tagsMultipleArray = array(
"A1",
"A2",
"A3",
// "A4",
// "A5",
// "A6",
"K1",
// "LK", // currently, refbase does only support one link per record
// "UL", // currently, refbase does only support one URL per record
"ID",
"CN",
"IP",
"NO",
"AV"
);
// This array matches RefWorks reference types with their corresponding refbase types:
// (RefWorks types that are currently not supported in refbase will be taken as is but will get
// prefixed with an "Unsupported: " label; '#fallback#' in comments indicates a type mapping that
// is not a perfect match but as close as currently possible)
// "RefWorks type" => "refbase type" // name of RefWorks reference type (comment)
$referenceTypesToRefbaseTypesArray = array(
"Abstract" => "Abstract", // Abstract
"Artwork" => "Unsupported: Artwork", // Artwork
"Bills\/Resolutions" => "Unsupported: Bills/Resolutions", // Bills/Resolutions
"Book,? (Section|Chapter)" => "Book Chapter", // Book, Section
"Book, Edited" => "Book Whole", // Book, Edited (#fallback#)
"Book, Whole" => "Book Whole", // Book, Whole
"Case\/Court Decisions" => "Unsupported: Case/Court Decisions", // Case/Court Decisions
"Computer Program" => "Software", // Computer Program
"Conference Proceeding" => "Conference Article", // Conference Proceeding
"Dissertation(\/Thesis)?" => "Thesis", // Dissertation/Thesis (function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field)
"Dissertation(\/Thesis)?, Unpublished" => "Thesis", // Dissertation/Thesis, Unpublished (#fallback#) (function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field)
"Generic" => "Miscellaneous", // Generic
"Grant" => "Unsupported: Grant", // Grant
"Hearing" => "Unsupported: Hearing", // Hearing
"Journal" => "Journal Article", // Journal
"Journal, Electronic" => "Journal Article", // Journal, Electronic (#fallback#) (function 'parseRecords()' should set the 'online_publication' field accordingly)
"Laws\/Statutes" => "Unsupported: Laws/Statutes", // Laws/Statutes
"Magazine Article" => "Magazine Article", // Magazine Article
"Map" => "Map", // Map
"Monograph" => "Book Whole", // Monograph (#fallback#)
"Motion Picture" => "Unsupported: Motion Picture", // Motion Picture
"Music Score" => "Unsupported: Music Score", // Music Score
"Newspaper Article" => "Newspaper Article", // Newspaper Article
"Online Discussion Forum" => "Unsupported: Online Discussion Forum", // Online Discussion Forum
"Patent" => "Patent", // Patent
"Personal Communication" => "Unsupported: Personal Communication", // Personal Communication
"Report" => "Report", // Report
"Sound Recording" => "Unsupported: Sound Recording", // Sound Recording
"Thesis(\/Dissertation)?" => "Thesis", // Dissertation/Thesis (function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field)
"Unpublished Material" => "Manuscript", // Unpublished Material (#fallback#)
"Video\/DVD" => "Unsupported: Video/DVD", // Video/DVD
"Web Page" => "Unsupported: Web Page" // Web Page
);
// -----------------------------------------
// Split input text into individual records:
$recordArray = splitSourceText($sourceText, $recordDelimiter, false); // split on the "ER" (= end of record) tag that terminates every RefWorks record
// Validate all records that shall be imported:
list($errors, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray) = validateRecords($recordArray, $requiredTagsArray, $importRecordsRadio, $importRecordNumbersArray, $errors);
// Parse all records that shall be imported:
list($parsedRecordsArray, $recordsCount) = parseRecords($recordArray, "RefWorks", $importRecordNumbersRecognizedFormatArray, $tagsToRefbaseFieldsArray, $tagsMultipleArray, $referenceTypesToRefbaseTypesArray, $fieldDelimiter, $dataDelimiter, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray, $preprocessorActionsArray);
// Build refbase import array:
$importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
"1.0", // 'version' - the version of the given array structure
"http://refbase.net/import/refworks/", // 'creator' - the name of the script/importer (preferably given as unique URI)
"Matthias Steffens", // 'author' - author/contact name of the person who's responsible for this script/importer
"refbase@extracts.de", // 'contact' - author's email/contact address
array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
$parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)
return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
}
// --------------------------------------------------------------------
// SCIFINDER TO REFBASE
// This function converts records from SciFinder (<http://www.cas.org/SCIFINDER/>) Tagged Format
// into the standard "refbase" array format, which can then be imported by the 'addRecords()' function
// in 'include.inc.php'.
// Parameters:
//   $sourceText               - raw source text containing one or more records in SciFinder Tagged Format
//   $importRecordsRadio       - passed on unchanged to 'validateRecords()' (which only checks whether it equals "only")
//   $importRecordNumbersArray - passed on unchanged to 'validateRecords()' (which uses it as the list of 1-based record
//                               numbers that shall be imported if '$importRecordsRadio' is "only")
// Returns an array with five elements:
//   [0] the import data array built by 'buildImportArray()'
//   [1] the records count as returned by 'parseRecords()'
//   [2] the numbers of all records that shall be imported AND which were of a recognized format
//   [3] the numbers of all records that shall be imported BUT which were NOT of a recognized format
//   [4] the errors array as returned by 'validateRecords()'
// Side effect: the global '$errors' variable gets overwritten with the errors array returned by 'validateRecords()'.
function scifinderToRefbase($sourceText, $importRecordsRadio, $importRecordNumbersArray)
{
global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'
global $errors;
global $showSource;
// NOTE: of the global variables declared above, this function itself only makes use of '$lower', '$upper', '$punct',
//       '$patternModifiers' (see the postprocessor actions for the 'language' field below) and '$errors';
//       '$showSource' and the remaining character class variables are not referenced within this function body.
// The SciFinder format uses variable-length field label names, which makes it
// impossible to match field labels using regular expressions with perl-style
// look-behinds (such as '(?<=...)'). This poses a problem when specifying an
// appropriate regex pattern for variable '$dataDelimiter'. Therefore, we'll
// preprocess the '$sourceText' so that delimiters between field labels and
// field data can be easily matched.
$sourceText = preg_replace("/^(FIELD [^:\r\n]+):/m", "\\1__dataDelimiter__", $sourceText); // replace the first colon (":"), which separates a field label from its data, with a custom string ("__dataDelimiter__")
// Define regular expression patterns that will facilitate parsing of SciFinder data:
// (patterns must be specified as perl-style regular expression, without the leading & trailing slashes, if not stated otherwise)
// Pattern by which the input text will be split into individual records:
$recordDelimiter = "\s*(START_RECORD[\r\n]+|[\r\n]+END_RECORD)\s*";
// Pattern by which records will be split into individual fields:
$fieldDelimiter = "[\r\n]+FIELD *";
// Pattern by which fields will be split into their field label (tag) and field data:
$dataDelimiter = " *__dataDelimiter__ *";
// Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data:
// (Notes: - name standardization occurs after multiple author fields have been merged by '; '
// - the split pattern must be specified as perl-style regular expression (including the leading & trailing
// slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
$personDelimiter = "/ *; */";
// Pattern by which a person's family name is separated from the given name (or initials):
// (the split pattern must be specified as perl-style regular expression (including the leading & trailing
// slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
$familyNameGivenNameDelimiter = "/ *, */";
// Specifies whether the person's family name comes first within a person's name
// ('true' means that the family name is followed by the given name (or initials), 'false' means that the person's family name comes *after* the given name (or initials))
$familyNameFirst = true;
// Specifies whether a person's full given name(s) shall be shortened to initial(s):
// (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized (meaning removal of extra whitespace, adding of dots between initials, etc)
// - if set to 'false', given names (and any initials) are taken as is
// - in your database, you should stick to either fully written given names OR initials; if you mix these, records won't get sorted correctly on citation output)
$shortenGivenNames = true;
// Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'):
$transformCase = true;
// Preprocessor actions:
// Defines search & replace 'actions' that will be applied to each record's raw source data if the pattern in the corresponding 'match' element is matched:
// (If you don't want to perform any preprocessor actions, specify an empty array, like: '$preprocessorActionsArray = array();'.
// Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
// "/Search Pattern/" => "Replace Pattern"
$preprocessorActionsArray = array();
// Postprocessor actions:
// Defines search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element:
// (If you don't want to perform any search and replace actions, specify an empty array, like: '$postprocessorActionsArray = array();'.
// Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
// "/Search Pattern/" => "Replace Pattern"
$postprocessorActionsArray = array(
array(
'fields' => array("year"),
'actions' => array(
"/^.*?(\d{4}).*/" => "\\1", // for the 'year' field, extract any four-digit number (and discard everything else)
"/^\D+$/" => "" // clear the 'year' field if it doesn't contain any number
)
),
array(
'fields' => array("pages"),
'actions' => array(
"/(\d+ *pp?)\./" => "\\1" // strip any trailing dots from "xx pp." or "xx p." in the 'pages' field
)
),
array(
'fields' => array("title", "address"),
'actions' => array(
"/[,.;:!] *$/" => "", // remove any punctuation (except for question marks) from end of field contents
"/,(?! )/" => ", " // add a space after a comma if missing (this mainly regards the 'Corporate Source' -> 'address' field)
)
),
array(
'fields' => array("abstract"),
'actions' => array(
'/\\\\"/' => '"', // convert escaped quotes (\") into unescaped quotes (")
"/ *\[on SciFinder \(R\)\]$/" => "" // remove attribution string " [on SciFinder (R)]" from end of field contents
)
),
array(
'fields' => array("language"),
'actions' => array(
"/^[$lower$punct ]+(?=[$upper][$lower]+)/$patternModifiers" => "", // remove any all-lowercase prefix string (so that field contents such as "written in English." get reduced to "English.")
"/language unavailable/" => "", // remove "language unavailable" string
"/[$punct] *$/$patternModifiers" => "" // remove any punctuation from end of field contents
)
),
array(
'fields' => array("notes"),
'actions' => array(
// NOTE(review): the pattern below matches the case-sensitive prefix "Can " (not "CAN " as stated in its comment) --
//               presumably the prefix has already been case-transformed at this point; confirm against 'parseRecords()'
"/^Can (\d+)/" => "CAN:\\1", // convert any existing "CAN " prefix in front of any number that's at the beginning of the 'notes' field (which originated from the SciFinder 'Chemical Abstracts Number(CAN)' field)
"/^(\d+)/" => "CAN:\\1" // insert a "CAN:" prefix in front of any number that's at the beginning of the 'notes' field (we map the SciFinder 'Chemical Abstracts Number(CAN)' field to the 'notes' field)
)
)
);
// This array lists patterns which match all SciFinder tags that must occur within a record to be recognized as valid SciFinder record:
// (Array keys must contain the tag name as it should be displayed to the user; as is the case with search & replace actions,
// the search patterns MUST include the leading & trailing slashes.)
// "tag display name" => "tag search pattern"
$requiredTagsArray = array(
"Document Type" => "/^FIELD Document Type/m"
);
// This array matches SciFinder tags with their corresponding refbase fields:
// (fields that are unsupported in either SciFinder or refbase are commented out)
// "SciFinder tag" => "refbase field" // SciFinder tag name (comment)
$tagsToRefbaseFieldsArray = array(
"Document Type" => "type", // Document Type (IMPORTANT: the array element that maps to 'type' must be listed as the first element!)
// "" => "thesis",
"Author" => "author", // Primary Authors
// "" => "editor", // Secondary Authors (Editors)
// "" => "series_editor", // Tertiary Authors (Series Editors)
"Corporate Source" => "address", // Corporate Source
// "" => "corporate_author", // Corporate Author
"Title" => "title", // Primary Title
// "" => "orig_title", // Original Foreign Title
"Publication Year" => "year", // Publication Year
"Publication Date" => "year", // Publication Date
"Journal Title" => "publication", // Periodical name: full format
// "" => "abbrev_journal", // Periodical name: standard abbreviation
// "" => array("Book, Section" => "publication", "Other" => "series_title"), // Secondary Title
// "" => "abbrev_series_title", // Tertiary Title
"Volume" => "volume", // Volume
"Issue" => "issue", // Issue
"Page" => "pages", // Page
// "" => "series_volume", // (for 'series_volume' and 'series_issue', some magic will be applied within the 'parseRecords()' function)
// "" => "series_issue",
// "" => "publisher", // Publisher
// "" => "place", // Place of Publication
// "" => "edition", // Edition
// "" => "medium", // Medium
"Internat.Standard Doc. Number" => array("Book, Section" => "isbn", "Book, Edited" => "isbn", "Book" => "isbn", "Dissertation" => "isbn", "Dissertation/Thesis" => "isbn", "Other" => "issn"), // Book Whole & Book Chapter: ISBN; Other reference types: ISSN
"Language" => "language", // Language
// "" => "summary_language", // Summary Language
"Index Terms" => "keywords", // Index Terms
// "Index Terms(2)" => "keywords", // Index Terms(2)
"Abstract" => "abstract", // Abstract
// "" => "area",
// "" => "expedition",
// "" => "conference",
// "" => "doi", // Digital Object Identifier
"URL" => "url", // URL
// "" => "file", // Link to PDF
// "" => "related", // Related Records
// "" => "call_number", // Call Number
"Chemical Abstracts Number(CAN)" => "notes", // Chemical Abstracts Number(CAN)
// "" => "contribution_id",
// "" => "online_publication",
// "" => "online_citation",
// "" => "approved",
// "" => "orig_record",
// "" => "copy", // Reprint status
// "Copyright" => "", // Copyright
// "Database" => "", // Database
// "Accession Number" => "", // Accession Number
// "Section Code" => "", // Section Code
// "Section Title" => "", // Section Title
// "CA Section Cross-references" => "", // CA Section Cross-references
// "CODEN" => "", // CODEN
// "CAS Registry Numbers" => "", // CAS Registry Numbers
// "Supplementary Terms" => "", // Supplementary Terms
// "PCT Designated States" => "", // PCT Designated States
// "PCT Reg. Des. States" => "", // PCT Reg. Des. States
// "Reg.Pat.Tr.Des.States" => "", // Reg.Pat.Tr.Des.States
// "Main IPC" => "", // Main IPC
// "IPC" => "", // IPC
// "Secondary IPC" => "", // Secondary IPC
// "Additional IPC" => "", // Additional IPC
// "Index IPC" => "", // Index IPC
// "Inventor Name" => "", // Inventor Name
// "National Patent Classification" => "", // National Patent Classification
// "Patent Application Country" => "", // Patent Application Country
// "Patent Application Date" => "", // Patent Application Date
// "Patent Application Number" => "", // Patent Application Number
// "Patent Assignee" => "", // Patent Assignee
// "Patent Country" => "", // Patent Country
// "Patent Kind Code" => "", // Patent Kind Code
// "Patent Number" => "", // Patent Number
// "Priority Application Country" => "", // Priority Application Country
// "Priority Application Number" => "", // Priority Application Number
// "Priority Application Date" => "", // Priority Application Date
// "Citations" => "", // Citations
);
// This array lists all SciFinder tags that may occur multiple times:
$tagsMultipleArray = array(
// "Chemical Abstracts Number(CAN)",
// "Index Terms", // by allowing "Index Terms" and "Index Terms(2)" to occur multiple times we can merge contents of both of these fields into the 'keywords' field
// "Index Terms(2)",
"Publication Year", // by allowing "Publication Year" and "Publication Date" to occur multiple times we can merge contents of both of these fields into the 'year' field (then, we'll extract the first four-digit number from it)
"Publication Date"
);
// This array matches SciFinder reference types with their corresponding refbase types:
// (SciFinder types that are currently not supported in refbase will be taken as is but will get
// prefixed with an "Unsupported: " label; '#fallback#' in comments indicates a type mapping that
// is not a perfect match but as close as currently possible)
// (NOTE: the commented reference types are NOT from SciFinder but are remains from the 'refworksToRefbase()' function!)
// "SciFinder type" => "refbase type" // name of SciFinder reference type (comment)
$referenceTypesToRefbaseTypesArray = array(
// "Abstract" => "Abstract", // Abstract
// "Artwork" => "Unsupported: Artwork", // Artwork
// "Bills\/Resolutions" => "Unsupported: Bills/Resolutions", // Bills/Resolutions
// "Book,? (Section|Chapter)" => "Book Chapter", // Book, Section
// "Book, Edited" => "Book Whole", // Book, Edited (#fallback#)
"Book(;.*)?" => "Book Whole", // Book
// "Case\/Court Decisions" => "Unsupported: Case/Court Decisions", // Case/Court Decisions
// "Computer Program" => "Software", // Computer Program
// "Conference Proceeding" => "Conference Article", // Conference Proceeding
// "Dissertation(\/Thesis)?" => "Thesis", // Dissertation/Thesis (function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field)
// "Dissertation(\/Thesis)?, Unpublished" => "Thesis", // Dissertation/Thesis, Unpublished (#fallback#) (function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field)
// "Generic" => "Miscellaneous", // Generic
// "Grant" => "Unsupported: Grant", // Grant
// "Hearing" => "Unsupported: Hearing", // Hearing
"Journal(;.*)?" => "Journal Article", // Journal
// "Journal, Electronic" => "Journal Article", // Journal, Electronic (#fallback#) (function 'parseRecords()' should set the 'online_publication' field accordingly)
// "Laws\/Statutes" => "Unsupported: Laws/Statutes", // Laws/Statutes
// "Magazine Article" => "Magazine Article", // Magazine Article
// "Map" => "Map", // Map
// "Monograph" => "Book Whole", // Monograph (#fallback#)
// "Motion Picture" => "Unsupported: Motion Picture", // Motion Picture
// "Music Score" => "Unsupported: Music Score", // Music Score
// "Newspaper Article" => "Newspaper Article", // Newspaper Article
// "Online Discussion Forum" => "Unsupported: Online Discussion Forum", // Online Discussion Forum
// "Patent" => "Patent", // Patent
// "Personal Communication" => "Unsupported: Personal Communication", // Personal Communication
"Report(;.*)?" => "Report", // Report
// "Sound Recording" => "Unsupported: Sound Recording", // Sound Recording
// "Thesis(\/Dissertation)?" => "Thesis", // Dissertation/Thesis (function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field)
"Preprint" => "Manuscript", // Preprint (#fallback#)
// "Video\/DVD" => "Unsupported: Video/DVD", // Video/DVD
// "Web Page" => "Unsupported: Web Page" // Web Page
);
// Other SciFinder Document Types which I've encountered so far:
// "General Review" => "" // General Review
// "Online Computer File" => "" // Online Computer File
// -----------------------------------------
// Split input text into individual records:
$recordArray = splitSourceText($sourceText, $recordDelimiter, false); // split on the "START_RECORD"/"END_RECORD" tags that delimit every SciFinder record
// Validate all records that shall be imported:
// (the errors array returned by 'validateRecords()' replaces the contents of the global '$errors' variable)
list($errors, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray) = validateRecords($recordArray, $requiredTagsArray, $importRecordsRadio, $importRecordNumbersArray, $errors);
// Parse all records that shall be imported:
// (only the record numbers that were recognized by 'validateRecords()' are handed to 'parseRecords()')
list($parsedRecordsArray, $recordsCount) = parseRecords($recordArray, "SciFinder", $importRecordNumbersRecognizedFormatArray, $tagsToRefbaseFieldsArray, $tagsMultipleArray, $referenceTypesToRefbaseTypesArray, $fieldDelimiter, $dataDelimiter, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray, $preprocessorActionsArray);
// Build refbase import array:
$importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
"1.0", // 'version' - the version of the given array structure
"http://refbase.net/import/scifinder/", // 'creator' - the name of the script/importer (preferably given as unique URI)
"Matthias Steffens", // 'author' - author/contact name of the person who's responsible for this script/importer
"refbase@extracts.de", // 'contact' - author's email/contact address
array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
$parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)
return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
}
// --------------------------------------------------------------------
// IDENTIFY SOURCE FORMAT
// This function tries to identify the format of the input text:
// Parameters:
//   $sourceText - the raw input text whose bibliographic format shall be detected
// Returns the name of the first format whose signature patterns ALL match the input text
// (e.g. "RIS" or "BibTeX"), or an empty string if none of the known signatures matches.
function identifySourceFormat($sourceText)
{
	// Each format is identified by a list of patterns that must ALL be found in the input text.
	// NOTE: the order of the entries matters -- formats are tested top to bottom and the first
	//       format whose patterns all match wins.
	// "format name" => array("required pattern", ...)
	$formatSignatures = array(
		// CSA records must at least start with a record identifier ("Record x of xx") and contain the "SO: Source" tag
		"CSA"            => array("/^Record \d+ of \d+/m", "/^SO: Source *[\r\n]+ {4,4}/m"),
		// PubMed MEDLINE records must at least contain the "PMID" and "PT" tags
		"Pubmed Medline" => array("/^PMID- /m", "/^PT - /m"),
		// PubMed XML records must at least contain the "<PubmedArticle>...</PubmedArticle>" root element
		"Pubmed XML"     => array("/<PubmedArticle[^<>\r\n]*>/i", "/<\/PubmedArticle>/"),
		// ISI Web of Science records must at least contain the "PT" and "SO" tags and end with an "ER" tag
		"ISI"            => array("/^PT /m", "/^SO /m", "/^ER *[\r\n]/m"),
		// RIS records must at least start with the "TY" tag and end with an "ER" tag (only their presence is checked)
		"RIS"            => array("/^TY - /m", "/^ER -/m"),
		// RefWorks records must at least start with the "RT" tag (only its presence is checked)
		"RefWorks"       => array("/^RT /m"),
		// SciFinder records must at least start with the "START_RECORD" tag and end with an "END_RECORD" tag (only their presence is checked)
		"SciFinder"      => array("/^START_RECORD/m", "/^END_RECORD/m"),
		// Copac records must at least contain the "TI" and "HL" tags
		"Copac"          => array("/^TI- /m", "/^HL- /m"),
		// Endnote tagged text (aka Endnote Refer) records must at least contain the "%0" tag
		"Endnote"        => array("/^%0 /m"),
		// MODS XML records must at least contain the "<mods>...</mods>" root element
		"MODS XML"       => array("/<mods[^<>\r\n]*>/i", "/<\/mods>/"),
		// Endnote XML records must at least contain the elements "<xml>...<records>...<record>"
		"Endnote XML"    => array("/<xml>[^<>]*?<records>[^<>]*?<record>/mi"),
		// BibTeX records must start with the "@" sign, followed by a type specifier and an optional cite key (such as in '@article{steffens1988,')
		"BibTeX"         => array("/^@\w+\s*\{[^ ,\r\n]* *, *[\r\n]/m"),
		// CrossRef "unixref" XML records must at least contain the "<doi_records>...</doi_records>" root element
		// TODO: improve match
		"CrossRef XML"   => array("/<doi_records[^<>\r\n]*>/i", "/<\/doi_records>/")
		// arXiv.org Atom XML OpenSearch format:
		// TODO: add regex pattern that matches arXiv.org Atom feeds
	);

	foreach ($formatSignatures as $formatName => $signaturePatterns)
	{
		$signatureMatches = true;

		foreach ($signaturePatterns as $signaturePattern)
		{
			if (!preg_match($signaturePattern, $sourceText))
			{
				$signatureMatches = false; // one required pattern is missing, so this format is ruled out
				break;
			}
		}

		if ($signatureMatches)
			return $formatName;
	}

	return ""; // no known format recognized
}
// --------------------------------------------------------------------
// IDENTIFY SOURCE ID
// This function tries to identify the type of the IDs contained in the input string:
// TODO:
// - modify the code so that '$sourceIDs' can contain a mixture of any supported IDs
// - after splitting on whitespace, verify ALL items and check whether they match one of the recognized ID patterns
// - better identification/verification of OpenURLs
// - to support OpenURL context objects from COinS or Atom XML, we need to decode ampersand characters ('&amp;' -> '&'),
// and allow for OpenURLs that don't start with '?' or '&'
// Parameters:
//   $sourceIDs - string containing the ID(s) whose type shall be detected
// Returns the name of the source format that is associated with the first ID type found in the
// input string ("CrossRef XML", "arXiv XML" or "Pubmed Medline"), or an empty string if no known
// ID pattern matches. ID types are tested in the order DOI, OpenURL, arXiv ID, PubMed ID.
function identifySourceID($sourceIDs)
{
	// DOIs:
	// NOTE(review): this pattern only accepts DOIs with a four-digit registrant code ('10.\d{4}/');
	//               DOIs with longer registrant codes are not recognized as DOIs here
	$doiPattern = "#(?<=^|\s)(doi:|http://dx\.doi\.org/)?10\.\d{4}/\S+?(?=$|\s)#i";

	// OpenURLs:
	// (OpenURLs must contain the 'ctx_ver=Z39.88-2004' key/value pair)
	$openURLPattern = "#(?<=^|\s)(openurl:|http://.+?(?=\?))?.*?(?<=[?&])ctx_ver=Z39\.88-2004(?=&|$).*?(?=$|\s)#i";

	// arXiv IDs:
	$arXivPattern = "#(?<=^|\s)(arXiv:|http://arxiv\.org/abs/)?([\w.-]+/\d{7}|\d{4}\.\d{4,})(v\d+)?(?=$|\s)#i";

	// PubMed IDs:
	// (any whitespace-delimited run of digits)
	$pubmedPattern = "/(?<=^|\s)\d+(?=$|\s)/";

	if (preg_match($doiPattern, $sourceIDs))
		return "CrossRef XML";

	if (preg_match($openURLPattern, $sourceIDs))
		return "CrossRef XML";

	if (preg_match($arXivPattern, $sourceIDs))
		return "arXiv XML";

	if (preg_match($pubmedPattern, $sourceIDs))
		return "Pubmed Medline";

	return ""; // ID type not recognized
}
// --------------------------------------------------------------------
// SPLIT SOURCE TEXT
// This function splits the input text at the specified delimiter and returns an array of records:
// Parameters:
//   $sourceText          - the text that shall be split into records
//   $splitPattern        - perl-style regular expression (WITHOUT the leading & trailing slashes) that matches the record delimiter
//   $returnEmptyElements - if 'true', empty pieces are included in the returned array; otherwise, empty pieces are omitted
// Returns the result of 'preg_split()', i.e. an array containing the pieces of '$sourceText'.
function splitSourceText($sourceText, $splitPattern, $returnEmptyElements)
{
	// the 'PREG_SPLIT_NO_EMPTY' flag causes only non-empty pieces to be returned;
	// a flags value of 0 (together with a limit of -1) equals the defaults of 'preg_split()'
	$splitFlags = $returnEmptyElements ? 0 : PREG_SPLIT_NO_EMPTY;

	return preg_split("/" . $splitPattern . "/", $sourceText, -1, $splitFlags);
}
// --------------------------------------------------------------------
// VALIDATE RECORDS
// This function takes an array of records containing the source data (as tagged text) and
// checks for each record if any of the required fields (given in '$requiredTagsArray') are missing:
// Parameters:
//   $recordArray              - array of records (each record given as tagged text), indexed from 0
//   $requiredTagsArray        - array of required tags ("tag display name" => "tag search pattern", with the
//                               search patterns including the leading & trailing slashes)
//   $importRecordsRadio       - if set to "only", only those records get validated whose record number is
//                               listed in '$importRecordNumbersArray'
//   $importRecordNumbersArray - array of record numbers (counting from 1) that shall be imported
//   $errors                   - array of error messages collected so far; messages about unrecognized records
//                               get appended (separated by "<br>") to its 'sourceText' element
// Returns an array with three elements:
//   [0] the (possibly extended) errors array
//   [1] the numbers of all records that shall be imported AND which were of a recognized format
//   [2] the numbers of all records that shall be imported BUT which were of an UNrecognized format
function validateRecords($recordArray, $requiredTagsArray, $importRecordsRadio, $importRecordNumbersArray, $errors)
{
	$recognizedRecordNumbers = array();
	$unrecognizedRecordNumbers = array();

	$totalRecords = count($recordArray);

	for ($recordIndex = 0; $recordIndex < $totalRecords; $recordIndex++)
	{
		$recordNumber = $recordIndex + 1; // array indexes start with 0 while record numbers start with 1

		// skip any record that we're NOT supposed to import:
		if (($importRecordsRadio == "only") && (!in_array($recordNumber, $importRecordNumbersArray)))
			continue;

		$recordIsEmpty = empty($recordArray[$recordIndex]);

		// collect the display names of all required tags that are missing from a non-empty record:
		$missingTags = array();

		if (!$recordIsEmpty)
		{
			foreach ($requiredTagsArray as $tagDisplayName => $tagSearchPattern)
			{
				if (!preg_match($tagSearchPattern, $recordArray[$recordIndex]))
					$missingTags[] = $tagDisplayName;
			}
		}

		// a record is taken as valid if it isn't empty and if all required tag patterns were matched:
		if (!$recordIsEmpty && empty($missingTags))
		{
			$recognizedRecordNumbers[] = $recordNumber;
			continue;
		}

		// otherwise, the record format is NOT recognized:
		$unrecognizedRecordNumbers[] = $recordNumber;

		// prepare an appropriate error message:
		$recordError = "Record " . $recordNumber . ":";

		// Handle PubMed Medline errors:
		// TODO: - improve identification of Medline errors
		//       - handle PubMed XML
		$looksLikePubMedError = (preg_match("/^\s*<html>/i", $recordArray[$recordIndex]) && preg_match("/Error occurred:/", $recordArray[$recordIndex])); // a PubMed error occurred, probably because an unrecognized PubMed ID was given

		if ($looksLikePubMedError)
		{
			// attempt to extract the PubMed error message:
			$recordError .= preg_replace("/.*Error occurred: *([^<>]+).*/s", " PubMed error: \\1.", $recordArray[$recordIndex]);
		}
		else
		{
			$recordError .= " Unrecognized data format!";

			if (!empty($missingTags)) // name the required field(s) that were missing
			{
				$missingLabel = (count($missingTags) == 1) ? " Required field missing: " : " Required fields missing: ";
				$recordError .= $missingLabel . implode(', ', $missingTags);
			}
		}

		// append the message to any error messages that were already collected for the source text:
		$errors["sourceText"] = isset($errors["sourceText"]) ? $errors["sourceText"] . "<br>" . $recordError : $recordError;
	}

	return array($errors, $recognizedRecordNumbers, $unrecognizedRecordNumbers);
}
// --------------------------------------------------------------------
// PARSE RECORDS
// This function processes an array of records containing the source data (as tagged text) and
// returns an array of records where each record contains an array of extracted field data.
//
// Parameters:
//   $recordArray                              - array of raw records (tagged text)
//   $recordFormat                             - handed over unchanged to function 'standardizeFieldData()'
//   $importRecordNumbersRecognizedFormatArray - numbers (counting starts at 1) of the records that shall be parsed; all other records are skipped
//   $tagsToRefbaseFieldsArray                 - maps source tags to refbase field names; a value may also be an array of
//                                               'reference type' => 'refbase field' pairs (with an optional 'Other' fallback element)
//   $tagsMultipleArray                        - tags that may occur multiple times within a record (their items get merged with "; ")
//   $referenceTypesToRefbaseTypesArray        - search & replace patterns that convert format-specific reference types into refbase types
//   $fieldDelimiter, $dataDelimiter           - regex patterns (without enclosing delimiters) that separate fields within a record,
//                                               and the tag from its data within a field, respectively
//   $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $postprocessorActionsArray
//                                             - handed over unchanged to function 'standardizeFieldData()'
//   $transformCase                            - if true, field data consisting only of upper case letters are converted to title case
//   $preprocessorActionsArray                 - array of arrays with a 'match' element (regex pattern) and an 'actions' element; the actions
//                                               are applied to the raw record text if the 'match' pattern matches
//
// Returns 'array($parsedRecordsArray, $recordsCount)' where '$recordsCount' is the total number of records
// given in '$recordArray' (i.e. including any skipped records).
function parseRecords($recordArray, $recordFormat, $importRecordNumbersRecognizedFormatArray, $tagsToRefbaseFieldsArray, $tagsMultipleArray, $referenceTypesToRefbaseTypesArray, $fieldDelimiter, $dataDelimiter, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray, $preprocessorActionsArray)
{
	global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'
	global $showSource;

	$parsedRecordsArray = array(); // initialize array variable which will hold parsed data of all records that shall be imported
	$recordsCount = count($recordArray); // count how many records are available

	// LOOP OVER EACH RECORD:
	for ($i=0; $i<$recordsCount; $i++) // for each record...
	{
		// skip this record if we're NOT supposed to import it (because it was either not selected by the user -OR- because it did contain an unrecognized data format):
		if (!in_array(($i+1), $importRecordNumbersRecognizedFormatArray)) // '$i' starts with 0 so we have to add 1 to point to the correct record number
			continue;

		// PRE-PROCESS FIELD DATA:
		// apply search & replace 'actions' to each record's raw source data:
		foreach ($preprocessorActionsArray as $thisMatchActionsArray)
		{
			if (preg_match($thisMatchActionsArray['match'], $recordArray[$i]))
				$recordArray[$i] = searchReplaceText($thisMatchActionsArray['actions'], $recordArray[$i], true); // function 'searchReplaceText()' is defined in 'include.inc.php'
		}

		// split each record into its fields:
		$fieldArray = preg_split("/" . $fieldDelimiter . "/", $recordArray[$i]);

		// initialize some variables:
		$fieldParametersArray = array(); // setup an empty array (it will hold all fields that were extracted for a given record)
		$tagContentsMultipleArray = array(); // this array will hold individual items of tags that can occur multiple times

		// LOOP OVER EACH FIELD:
		foreach ($fieldArray as $singleField) // for each field within the current record...
		{
			// split each field into its tag and its field data:
			// NOTE: we split at the FIRST occurrence of the data delimiter only (limit = 2), so that field data
			//       which itself contains the delimiter isn't truncated
			$fieldParts = preg_split("/" . $dataDelimiter . "/", $singleField, 2);

			if (!is_array($fieldParts)) // 'preg_split()' failed
				continue;

			$fieldLabel = $fieldParts[0];
			$fieldData = isset($fieldParts[1]) ? $fieldParts[1] : ""; // a field without any data delimiter has no data

			if (!isset($tagsToRefbaseFieldsArray[$fieldLabel])) // the current tag is NOT one we'd like to import
				continue;

			$tagMapping = $tagsToRefbaseFieldsArray[$fieldLabel];

			$fieldData = preg_replace("/\s{2,}/", " ", $fieldData); // remove any hard returns and extra spaces within the data string
			$fieldData = trim($fieldData); // remove any preceeding and trailing whitespace from the field data

			// if all of the field data is in uppercase letters, we attempt to convert the string to something more readable:
			// NOTE: while case transformation is also done in function 'standardizeFieldData()', we cannot omit it here
			//       since tags that can occur multiple times must be treated individually (i.e. before merging them)
			if ($transformCase AND ($tagMapping != "type")) // we exclude reference types from any case transformations
			{
				// TODO: we should probably only use Unicode-aware expressions here (i.e. something like "/^([$upper$digit]|[^$word])+$/$patternModifiers")
				if (preg_match("/^[$upper\W\d]+$/$patternModifiers", $fieldData))
					// convert upper case to title case (converts e.g. "ELSEVIER SCIENCE BV" into "Elsevier Science Bv"):
					// (note that this case transformation won't do the right thing for author initials and abbreviations,
					//  but the result is better than the whole string being upper case, IMHO)
					$fieldData = changeCase('title', $fieldData); // function 'changeCase()' is defined in 'include.inc.php'
			}

			// determine the refbase field that the current field data shall be copied to:
			$hasTargetField = false;
			$targetField = null;

			if (!is_array($tagMapping))
			{
				$targetField = $tagMapping;
				$hasTargetField = true;
			}
			else // if the current tag's value in '$tagsToRefbaseFieldsArray' is an array...
			{
				// ...we'll copy field data to different refbase fields depending on the current records reference type:
				// NOTE: this will only work if the array element that maps to 'type' has been already parsed,
				//       which is why '$tagsToRefbaseFieldsArray' should contain this as the first element!
				$currentType = isset($fieldParametersArray['type']) ? $fieldParametersArray['type'] : null; // the reference type may not have been parsed yet

				foreach ($tagMapping as $referenceType => $refbaseField)
				{
					if ($currentType == $referenceType)
					{
						$targetField = $refbaseField;
						$hasTargetField = true;
						break;
					}
				}

				if (!$hasTargetField AND isset($tagMapping['Other'])) // fall back to the default mapping (if any)
				{
					$targetField = $tagMapping['Other'];
					$hasTargetField = true;
				}
			}

			if ($hasTargetField)
			{
				// extract individual items of tags that can occur multiple times:
				foreach ($tagsMultipleArray as $tagMultiple)
				{
					if (preg_match("/^" . $tagMultiple . "$/i", $fieldLabel))
						$tagContentsMultipleArray[$targetField][] = $fieldData;
				}

				// copy field data to array of field parameters (using the corresponding refbase field name as element key):
				$fieldParametersArray[$targetField] = $fieldData;
			}
		}
		// (END LOOP OVER EACH FIELD)

		// POST-PROCESS FIELD DATA:
		if (empty($showSource) AND isset($fieldParametersArray['source'])) // if we're NOT supposed to display the original source data
			unset($fieldParametersArray['source']); // remove the special 'source' field from the array of fields

		// merge individual items of fields that can occur multiple times:
		foreach ($tagsMultipleArray as $tagMultiple)
		{
			if (!isset($tagsToRefbaseFieldsArray[$tagMultiple])) // no refbase field is known for this tag
				continue;

			$tagMapping = $tagsToRefbaseFieldsArray[$tagMultiple];

			if (!is_array($tagMapping))
			{
				if (isset($tagContentsMultipleArray[$tagMapping]))
					$fieldParametersArray[$tagMapping] = implode("; ", $tagContentsMultipleArray[$tagMapping]);
			}
			else // if the current tag's value in '$tagsToRefbaseFieldsArray' is an array...
			{
				// ...the target refbase field depends on the current records reference type:
				// NOTE: the type is looked up anew for each tag since the merge itself may write to the 'type' field
				$currentType = isset($fieldParametersArray['type']) ? $fieldParametersArray['type'] : null;
				$useDefault = true;

				foreach ($tagMapping as $referenceType => $refbaseField)
				{
					if (($currentType == $referenceType) AND isset($tagContentsMultipleArray[$refbaseField]))
					{
						$fieldParametersArray[$refbaseField] = implode("; ", $tagContentsMultipleArray[$refbaseField]);
						$useDefault = false;
						break;
					}
				}

				if ($useDefault AND isset($tagMapping['Other']) AND isset($tagContentsMultipleArray[$tagMapping['Other']]))
					$fieldParametersArray[$tagMapping['Other']] = implode("; ", $tagContentsMultipleArray[$tagMapping['Other']]);
			}
		}

		// convert format-specific reference types into refbase format:
		// (e.g. for the RIS format, convert "JOUR" into "Journal Article", etc)
		if (isset($fieldParametersArray['type']))
			$fieldParametersArray['type'] = searchReplaceText($referenceTypesToRefbaseTypesArray, $fieldParametersArray['type'], false); // function 'searchReplaceText()' is defined in 'include.inc.php'

		// standardize field data contained in '$fieldParametersArray':
		// (function 'standardizeFieldData()' e.g. performs case transformation, standardizes thesis names, normalizes page ranges, and reformats person names according to preference)
		$fieldParametersArray = standardizeFieldData($fieldParametersArray, $recordFormat, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray);

		// append the array of extracted field data to the main data array which holds all records to import:
		$parsedRecordsArray[] = $fieldParametersArray;
	}
	// (END LOOP OVER EACH RECORD)

	return array($parsedRecordsArray, $recordsCount);
}
// --------------------------------------------------------------------
// STANDARDIZE FIELD DATA
// This function standardizes field data contained in '$fieldParametersArray':
// (e.g. performs case transformation, standardizes thesis names, normalizes page ranges, and reformats person names according to preference)
//
// Parameters:
//   $fieldParametersArray  - array of 'refbase field name' => 'field data' pairs of a single record; since this function is
//                            also called individually, no particular field (such as 'type') is required to be present
//   $recordFormat          - not used by this function
//   $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames
//                          - handed over to function 'standardizePersonNames()' for the 'author', 'editor' and 'series_editor' fields
//   $transformCase         - if true, field data consisting only of upper case letters are converted to title case
//   $postprocessorActionsArray - array of arrays with a 'fields' element (array of field names) and an 'actions' element
//                            (search & replace actions that get applied to the listed fields)
//
// Returns the standardized '$fieldParametersArray' (an empty array is returned as is).
function standardizeFieldData($fieldParametersArray, $recordFormat, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray)
{
	global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'

	if (empty($fieldParametersArray))
		return $fieldParametersArray;

	// perform case transformation:
	// NOTE: this case transformation is kinda redundant if the record data were passed thru function 'parseRecords()' before,
	//       but we include it here since this function is also called individually (e.g. by function 'crossrefToRefbase()')
	foreach ($fieldParametersArray as $fieldKey => $fieldData) // for each field within the current record...
	{
		// if all of the field data is in uppercase letters, we attempt to convert the string to something more readable:
		if ($transformCase AND (!preg_match("/^(type|issn|url|doi)$/", $fieldKey))) // we exclude ISSN & DOI numbers, as well as URLs and reference types from any case transformations
		{
			// TODO: we should probably only use Unicode-aware expressions here (i.e. something like "/^([$upper$digit]|[^$word])+$/$patternModifiers")
			if (preg_match("/^[$upper\W\d]+$/$patternModifiers", $fieldData))
				// convert upper case to title case (converts e.g. "ELSEVIER SCIENCE BV" into "Elsevier Science Bv"):
				// (note that this case transformation won't do the right thing for author initials and abbreviations,
				//  but the result is better than the whole string being upper case, IMHO)
				$fieldParametersArray[$fieldKey] = changeCase('title', $fieldData); // function 'changeCase()' is defined in 'include.inc.php'
		}
	}

	// theses are stored as whole books with the kind of thesis given in the 'thesis' field:
	// (the 'type' field may be missing if this function is called individually)
	if (isset($fieldParametersArray['type']) AND preg_match("/Thesis/", $fieldParametersArray['type']))
	{
		$fieldParametersArray['type'] = "Book Whole";

		// standardize thesis names:
		if (isset($fieldParametersArray['thesis']))
		{
			if (preg_match("/^Master'?s?( thesis)?$/i", $fieldParametersArray['thesis']))
				$fieldParametersArray['thesis'] = "Master's thesis";
			elseif (preg_match("/^Bachelor'?s?( thesis)?$/i", $fieldParametersArray['thesis']))
				$fieldParametersArray['thesis'] = "Bachelor's thesis";
			elseif (preg_match("/^(Diploma( thesis)?|Dipl(om)?(arbeit)?)$/i", $fieldParametersArray['thesis']))
				$fieldParametersArray['thesis'] = "Diploma thesis";
			elseif (preg_match("/^(Doctoral( thesis)?|Diss(ertation)?|Doktor(arbeit)?)$/i", $fieldParametersArray['thesis']))
				$fieldParametersArray['thesis'] = "Doctoral thesis";
			elseif (preg_match("/^Habil(itation)?( thesis)?$/i", $fieldParametersArray['thesis']))
				$fieldParametersArray['thesis'] = "Habilitation thesis";
			else // if an unknown thesis name was given
				$fieldParametersArray['thesis'] = "Ph.D. thesis"; // NOTE: this fallback may actually be not correct!
		}
		else // if no thesis info was given
			$fieldParametersArray['thesis'] = "Ph.D. thesis"; // NOTE: this fallback may actually be not correct!
	}

	// merge contents of the special fields 'startPage' and 'endPage' into a range and copy it to the 'pages' field:
	// (these special fields will be then removed again from the '$fieldParametersArray' since they aren't valid refbase field names)
	if (isset($fieldParametersArray['startPage']) OR isset($fieldParametersArray['endPage']))
	{
		$pages = array();

		if (isset($fieldParametersArray['startPage']))
		{
			if (!empty($fieldParametersArray['startPage']))
				$pages[] = $fieldParametersArray['startPage'];

			unset($fieldParametersArray['startPage']);
		}

		if (isset($fieldParametersArray['endPage']))
		{
			if (!empty($fieldParametersArray['endPage']))
				$pages[] = $fieldParametersArray['endPage'];

			unset($fieldParametersArray['endPage']);
		}

		if (!empty($pages))
			$fieldParametersArray['pages'] = implode("-", $pages);

		// append "pp" identifier for whole books where the pages field contains a single number:
		// (both, the 'type' and the 'pages' field may be missing here, e.g. if the start & end page were given but empty)
		if (isset($fieldParametersArray['type']) AND isset($fieldParametersArray['pages']) AND preg_match("/Book Whole/", $fieldParametersArray['type']) AND preg_match("/^\d+$/", $fieldParametersArray['pages']))
			$fieldParametersArray['pages'] = $fieldParametersArray['pages'] . " pp";
	}

	// if the 'pages' field contains a page range, verify that the end page is actually greater than the start page:
	// TODO: - make regex patterns Unicode-aware (e.g. use '$punct' instead of '-')
	//       - can this be standardized with function 'formatPageInfo()' in 'cite.inc.php'?
	if (isset($fieldParametersArray['pages']) AND preg_match("/^\d+\D*-\D*\d+$/", $fieldParametersArray['pages']))
	{
		list($startPage, $endPage) = preg_split("/\D*-\D*/", $fieldParametersArray['pages']);

		$countStartPage = strlen($startPage);
		$countEndPage = strlen($endPage);

		if (($countStartPage > $countEndPage) AND ($startPage > $endPage))
		{
			// get the trailing digits of the start page that correspond to the (abbreviated) end page:
			$startPagePart = preg_replace("/^.*?(\d{" . $countEndPage . "})$/", "\\1", $startPage);

			if ($startPagePart < $endPage)
				$fieldParametersArray['pages'] = $startPage . "-" . ($startPage + ($endPage - $startPagePart)); // convert page ranges such as '673-6' or '673-85' to '673-676' or '673-685', respectively
		}
	}

	// standardize contents of the 'author', 'editor' and 'series_editor' fields:
	foreach (array('author', 'editor', 'series_editor') as $nameKey)
	{
		if (!empty($fieldParametersArray[$nameKey]))
			$fieldParametersArray[$nameKey] = standardizePersonNames($fieldParametersArray[$nameKey], $familyNameFirst, $personDelimiter, $familyNameGivenNameDelimiter, $shortenGivenNames);
	}

	// if the 'author' field is empty BUT the 'editor' field is not empty AND the record type is either a container item or a self-contained/independent item (such as 'Book Whole', 'Journal', 'Manuscript' or 'Map'):
	if (empty($fieldParametersArray['author']) AND !empty($fieldParametersArray['editor']) AND isset($fieldParametersArray['type']) AND preg_match("/^(Book Whole|Conference Volume|Journal|Manual|Manuscript|Map|Miscellaneous|Patent|Report|Software)$/", $fieldParametersArray['type']))
	{
		$fieldParametersArray['author'] = $fieldParametersArray['editor']; // duplicate field contents from 'editor' to 'author' field

		if (!preg_match("/;/", $fieldParametersArray['author'])) // if the 'author' field does NOT contain a ';' (which would delimit multiple authors) => single author
			$fieldParametersArray['author'] .= " (ed)"; // append " (ed)" to the end of the 'author' string
		else // the 'author' field does contain at least one ';' => multiple authors
			$fieldParametersArray['author'] .= " (eds)"; // append " (eds)" to the end of the 'author' string
	}

	// if some (full or abbreviated) series title was given, we assume that the information given in 'volume'/'issue' is actually the 'series_volume'/'series_issue':
	if (!empty($fieldParametersArray['series_title']) OR !empty($fieldParametersArray['abbrev_series_title']))
	{
		if (!empty($fieldParametersArray['volume']) AND empty($fieldParametersArray['series_volume'])) // move 'volume' to 'series_volume'
		{
			$fieldParametersArray['series_volume'] = $fieldParametersArray['volume'];
			unset($fieldParametersArray['volume']);
		}

		if (!empty($fieldParametersArray['issue']) AND empty($fieldParametersArray['series_issue'])) // move 'issue' to 'series_issue'
		{
			$fieldParametersArray['series_issue'] = $fieldParametersArray['issue'];
			unset($fieldParametersArray['issue']);
		}
	}

	// if the 'url' field actually contains a DOI prefixed with "http://dx.doi.org/" (AND the 'doi' field is empty), we'll extract the DOI and move it to the 'doi' field:
	// NOTE: DOI registrant codes consist of four OR MORE digits (e.g. "10.11646"), hence '\d{4,}'
	if (!empty($fieldParametersArray['url']) AND empty($fieldParametersArray['doi']) AND preg_match("#(?<=^|; )http://dx\.doi\.org/10\.\d{4,}/\S+?(?=$|; )#", $fieldParametersArray['url']))
	{
		$fieldParametersArray['doi'] = preg_replace("#(?:.+?; )?http://dx\.doi\.org/(10\.\d{4,}/\S+?)(?=$|; ).*#", "\\1", $fieldParametersArray['url']); // extract DOI to 'doi' field
		$fieldParametersArray['url'] = preg_replace("#^http://dx\.doi\.org/10\.\d{4,}/\S+?(?=$|; )(; )?#", "", $fieldParametersArray['url']); // remove DOI URL from beginning of 'url' field
		$fieldParametersArray['url'] = preg_replace("#(; )?http://dx\.doi\.org/10\.\d{4,}/\S+?(?=$|; )#", "", $fieldParametersArray['url']); // remove DOI URL from middle (or end) of 'url' field

		if (empty($fieldParametersArray['url'])) // the DOI URL was the only URL given
			unset($fieldParametersArray['url']);
	}

	if (!empty($fieldParametersArray['url'])) // besides any DOI URL, some other URL(s) were given
		$fieldParametersArray['url'] = preg_replace("/^([^ ]+?)(?=$|; ).*/", "\\1", $fieldParametersArray['url']); // remove everything but the first URL from the 'url' field (currently, refbase does only support one URL per record)

	// standardize format of ISSN number:
	if (!empty($fieldParametersArray['issn']) AND preg_match("/^ *\d{4}\D*\d{4} *$/", $fieldParametersArray['issn']))
		$fieldParametersArray['issn'] = preg_replace("/^ *(\d{4})\D*(\d{4}) *$/", "\\1-\\2", $fieldParametersArray['issn']);

	// apply search & replace 'actions' to all fields that are listed in the 'fields' element of the arrays contained in '$postprocessorActionsArray':
	foreach ($postprocessorActionsArray as $fieldActionsArray)
	{
		foreach ($fieldParametersArray as $fieldName => $fieldValue)
		{
			if (in_array($fieldName, $fieldActionsArray['fields']))
				$fieldParametersArray[$fieldName] = searchReplaceText($fieldActionsArray['actions'], $fieldValue, true); // function 'searchReplaceText()' is defined in 'include.inc.php'
		}
	}

	// if (except for a DOI) no other URL(s) are given AND the 'notes' field contains a PubMed ID, we extract the
	// PubMed ID and copy a resolvable URL (that points to the PubMed article's abstract page) to the 'url' field:
	if (!isset($fieldParametersArray['url']) AND isset($fieldParametersArray['notes']) AND preg_match("/PMID *: *\d+/i", $fieldParametersArray['notes']))
		$fieldParametersArray['url'] = "http://www.ncbi.nlm.nih.gov/pubmed/" . preg_replace("/.*?PMID *: *(\d+).*/i", "\\1", $fieldParametersArray['notes']);

	return $fieldParametersArray;
}
// --------------------------------------------------------------------
// STANDARDIZE PERSON NAMES
// Thin wrapper around function 'reArrangeAuthorContents()' (defined in 'include.inc.php') which is used by several
// import routines to bring the contents of the 'author', 'editor' or 'series_editor' field into a uniform shape.
// According to the parameter descriptions below, this covers re-arranging of family and given names, abbreviation
// of given names, and the delimiters used between persons, name parts and initials.
//
//   $nameString                   - contents of the person field
//   $familyNameFirst              - true if, in the source string, the family name precedes the given name (or initials)
//   $personDelimiter              - pattern describing the delimiter that separates different persons in the source string
//   $familyNameGivenNameDelimiter - pattern describing the delimiter that separates family name & given name (or initials) in the source string
//   $shortenGivenNames            - true if full given name(s) shall be shortened to initial(s)
//
// Returns the re-ordered name string as returned by 'reArrangeAuthorContents()'.
function standardizePersonNames($nameString, $familyNameFirst, $personDelimiter, $familyNameGivenNameDelimiter, $shortenGivenNames)
{
	// The numbers given below refer to the position of the parameter within the call to 'reArrangeAuthorContents()':
	return reArrangeAuthorContents(
		$nameString,                   //  1. input:  contents of the author field
		$familyNameFirst,              //  2. input:  does the family name come first (within one author) in the source string?
		                               //             ('true': family name is followed by the given name (or initials), 'false': the other way around)

		$personDelimiter,              //  3. input:  pattern describing old delimiter that separates different authors
		"; ",                          //  4. output: new delimiter that separates different authors (all authors except the last one)
		"; ",                          //  5. output: new delimiter that separates the last author from all other authors

		$familyNameGivenNameDelimiter, //  6. input:  pattern describing old delimiter that separates author name & initials (within one author)
		", ",                          //  7. output: new delimiter that separates author name & initials (first author)
		", ",                          //  8. output: new delimiter that separates author name & initials (all authors except the first one)
		".",                           //  9. output: new delimiter that separates multiple initials (within one author)
		false,                         // 10. output: first author: do initials go *before* ['true'] or *after* ['false'] the author's name? ('false' is the default in the db)
		false,                         // 11. output: same as 10. but for all authors except the first one
		$shortenGivenNames,            // 12. output: shall an author's full given name(s) be shortened to initial(s)?

		"",                            // 13. output: if the total number of authors is greater than this number (integer >= 1), only the number of authors given in (14)
		                               //             will be included along with the string given in (15); kept empty so that all authors are returned
		"",                            // 14. output: number of authors (integer >= 1) to include if the limit given in (13) is exceeded; empty since not applicable
		"",                            // 15. output: string that's appended to the authors given in (14) if the limit given in (13) is exceeded
		                               //             (the actual number of authors can be printed by including '__NUMBER_OF_AUTHORS__' within the string)

		false                          // 16. output: shall the re-ordered string be returned with higher ASCII chars HTML encoded?
	);
}
// --------------------------------------------------------------------
// BUILD IMPORT ARRAY
// Assembles the array structure that gets handed to function 'addRecords()' for import:
// (the required array structure is explained in more detail in the comments above
//  function 'addRecords()' in 'include.inc.php')
function buildImportArray($type, $version, $creator, $author, $contact, $options, $parsedRecordsArray)
{
	// NOTES (describing how function 'addRecords()' treats the returned array):
	//   - the calculation fields ('first_author', 'author_count', 'first_page', 'volume_numeric' and
	//     'series_volume_numeric') are taken care of by 'addRecords()'
	//
	//   - likewise, the *date/*time/*by fields ('created_date', 'created_time', 'created_by', 'modified_date',
	//     'modified_time' and 'modified_by') get filled automatically unless custom values (in correct date
	//     ['YYYY-MM-DD'] and time ['HH:MM:SS'] format) are included in the array
	//
	//   - custom info for the 'location' field could be included as well; since it's omitted here, 'addRecords()'
	//     inserts name & email address of the currently logged-in user (e.g. 'Matthias Steffens (refbase@extracts.de)')
	//
	//   - if the 'prefix_call_number' element of the 'options' array is set to "true", any 'call_number' string
	//     gets prefixed with the call number prefix of the currently logged-in user
	//
	//   - serial number(s) are assigned automatically and returned by 'addRecords()' in form of an array
	return array(
		'type'    => $type,              // the array format of the 'records' element
		'version' => $version,           // the version of the given array structure
		'creator' => $creator,           // the name of the script/importer (preferably given as unique URI)
		'author'  => $author,            // author/contact name of the person who's responsible for this script/importer
		'contact' => $contact,           // author's email/contact address
		'options' => $options,           // array with settings that control the behaviour of the 'addRecords()' function
		'records' => $parsedRecordsArray // array of record(s) (with each record being a sub-array of fields)
	);
}
// --------------------------------------------------------------------
// Converts any LaTeX/BibTeX markup contained in the given BibTeX source text
// into proper refbase markup and returns the converted text:
function standardizeBibtexInput($bibtexSourceText)
{
	global $contentTypeCharset; // defined in 'ini.inc.php'

	// '$transtab_bibtex_refbase' holds the search & replace patterns that convert LaTeX/BibTeX markup & entities to refbase markup:
	// LaTeX fontshape markup (italic, bold), super- and subscript as well as greek letters in math mode get converted into
	// the respective refbase commands. You may need to adopt the LaTeX markup to suit your individual needs.
	global $transtab_bibtex_refbase; // defined in 'transtab_bibtex_refbase.inc.php'

	// '$transtab_latex_latin1' and '$transtab_latex_unicode' are translation tables for best-effort conversion of
	// higher ASCII characters from LaTeX markup to ISO-8859-1 entities (or Unicode, respectively):
	global $transtab_latex_latin1; // defined in 'transtab_latex_latin1.inc.php'
	global $transtab_latex_unicode; // defined in 'transtab_latex_unicode.inc.php'

	// pick the character translation table that matches the charset in use:
	$characterTable = ($contentTypeCharset == "UTF-8") ? $transtab_latex_unicode : $transtab_latex_latin1;

	// first convert the markup, then the LaTeX encoded higher ASCII chars:
	// (function 'searchReplaceText()' is defined in 'include.inc.php')
	$markupConverted = searchReplaceText($transtab_bibtex_refbase, $bibtexSourceText, true);

	return searchReplaceText($characterTable, $markupConverted, false);
}
// --------------------------------------------------------------------
// Converts any text style markup contained in the given Endnote XML source text
// into proper refbase markup and returns the converted text:
function standardizeEndnoteXMLInput($endxSourceText)
{
	// '$transtab_endnotexml_refbase' holds the search & replace patterns that convert Endnote XML text style markup to refbase markup;
	// it attempts to convert fontshape markup (italic, bold) as well as super- and subscript into appropriate refbase markup:
	global $transtab_endnotexml_refbase; // defined in 'transtab_endnotexml_refbase.inc.php'

	// apply the search & replace actions to the given source text:
	return searchReplaceText($transtab_endnotexml_refbase, $endxSourceText, true); // function 'searchReplaceText()' is defined in 'include.inc.php'
}
// --------------------------------------------------------------------
// This function fetches source data from PubMed.gov for all PubMed IDs
// given in '$pmidArray':
// ('$sourceFormat' must be either "Pubmed Medline" or "Pubmed XML";
//  more info on the Entrez Programming Utilities:
//  <http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html>)
//
// Returns 'array($errors, $sourceText)' where '$errors' is the global '$errors' variable (which isn't
// modified by this function) and '$sourceText' contains the concatenated responses of all queries
// (an empty string if '$pmidArray' is empty).
function fetchDataFromPubMed($pmidArray, $sourceFormat = "Pubmed Medline")
{
	global $errors;

	$sourceText = "";

	if (!empty($pmidArray))
	{
		// Remove any duplicate PubMed IDs:
		$pmidArray = array_unique($pmidArray);

		// Define response format:
		if (preg_match("/^Pubmed XML$/i", $sourceFormat))
			$fetchType = "xml";
		else // by default, we'll use the "Pubmed Medline" format
			$fetchType = "text";

		// NOTE:
		// When querying PubMed for multiple PubMed IDs *at once*, errors are not
		// returned inline on a per-record basis. If one or more of the given
		// PubMed IDs are invalid, PubMed returns a single error message, if the
		// first given PubMed ID is invalid, otherwise it returns records until
		// the first invalid ID is encountered. In any case, the remaining records
		// seem to get omitted from the PubMed response.
		// To work around this, we'll query PubMed for each given PubMed ID
		// *individually* (similar to function 'fetchDataFromCrossRef()'), and we
		// then perform the record validation (i.e. error checking) in function
		// 'validateRecords()'.
		// See below for alternative code that fetches PubMed records via a single
		// HTTP request.

		// Build the part of the query URL that is identical for all PubMed IDs:
		$baseURL = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
		         . "?db=pubmed"
		         . "&retmode=" . $fetchType
		         . "&rettype=medline"
		         . "&tool=refbase"
		         . "&email=" . rawurlencode("info@refbase.net");

		foreach ($pmidArray as $pmid)
		{
			// Build query URL:
			// (the ID gets URL-encoded so that unexpected characters cannot break the query string or add parameters)
			$sourceURL = $baseURL . "&id=" . rawurlencode($pmid);

			// Perform query:
			$sourceText .= fetchDataFromURL($sourceURL); // function 'fetchDataFromURL()' is defined in 'include.inc.php'
		}

		// Alternative code that fetches PubMed records via a single HTTP request:
		// (while this may be more efficient, it prevents us from checking errors
		//  on a per-record level -- see note above)
		// // Merge PubMed IDs with commas:
		// $sourceIDs = implode(",", $pmidArray);
		//
		// // Build query URL:
		// $sourceURL = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
		// . "?db=pubmed"
		// . "&retmode=" . $fetchType
		// . "&rettype=medline"
		// . "&tool=refbase"
		// . "&email=" . rawurlencode("info@refbase.net")
		// . "&id=" . $sourceIDs;
		//
		// // Perform query:
		// $sourceText = fetchDataFromURL($sourceURL);
		//
		// // Handle errors:
		// if (!preg_match("/^PMID- /m", $sourceText) AND preg_match("/Error occurred:/", $sourceText)) // a PubMed error occurred, probably because only unrecognized PubMed IDs were given; TODO: handle PubMed XML
		// $errors["sourceText"] = preg_replace("/.*Error occurred: *([^<>]+).*/s", "PubMed error: \\1", $sourceText); // attempt to extract PubMed error message
	}

	return array($errors, $sourceText);
}
// --------------------------------------------------------------------
// This function fetches record metadata from arXiv.org for all arXiv IDs
// given in '$itemArray':
// (for '$sourceFormat', only "arXiv XML", i.e. the arXiv.org Atom XML OpenSearch format,
//  is currently supported; more info on the arXiv API:
//    <http://export.arxiv.org/api_help/>
//    <http://export.arxiv.org/api_help/docs/user-manual.html>)
//
// Parameters:
//   $itemArray    - array of arXiv IDs; duplicate IDs are removed before querying
//   $sourceFormat - currently not evaluated by this function
//
// Returns a two-element array:
//   [0] the global '$errors' variable (this function doesn't modify it)
//   [1] the initialized SimplePie feed object, or NULL if '$itemArray' was empty
//
// Requires the SimplePie library (by Ryan Parman and Geoffrey Sneddon), which is
// available under the BSD license from: <http://simplepie.org>
function fetchDataFromArXiv($itemArray, $sourceFormat = "arXiv XML")
{
	global $errors; // NOTE: ATM, error checking is done in function 'arxivToRefbase()'

	$sourceURLArray = array();
	$feed = null; // stays NULL if there's nothing to fetch (so that we never return an undefined variable)

	if (!empty($itemArray))
	{
		// Remove any duplicate IDs:
		$itemArray = array_unique($itemArray);

		// NOTE:
		// When querying arXiv.org for multiple arXiv IDs *at once*, errors are not
		// returned inline on a per-record basis. If one or more of the given
		// arXiv IDs are invalid, arXiv.org returns a *single* error message, and
		// any other requested records seem to get omitted from the arXiv response.
		// To work around this, we'll query arXiv.org for each given arXiv ID
		// *individually*, and we then perform the record validation (i.e. error
		// checking) in function 'arxivToRefbase()'.
		foreach ($itemArray as $item)
		{
			// Build query URL:
			// (the given IDs are passed on as is, i.e. their format isn't verified here)
			$sourceURLArray[] = "http://export.arxiv.org/api/query"
			                  . "?id_list=" . rawurlencode($item);
		}

		// Perform query:
		$feed = new SimplePie(); // setup new SimplePie constructor
		$feed->set_feed_url($sourceURLArray); // setup multi-feed request
		$feed->set_input_encoding('UTF-8'); // force UTF-8 as input encoding
		$feed->enable_cache(false); // disable caching
		$feed->enable_order_by_date(false); // disable automatic sorting of entries by date
		$feed->init(); // process options, fetch feeds, cache, parse, merge, etc
	}

	return array($errors, $feed);
}
// --------------------------------------------------------------------
// This function tries to fetch PubMed IDs from PubMed.gov for all DOIs given
// in '$doiArray'. If a PubMed ID could be found for *every* given DOI, the
// matching records are fetched via function 'fetchDataFromPubMed()'.
//
// Parameters:
//   $doiArray     - array of DOIs; duplicate DOIs are removed before querying
//   $sourceFormat - currently not evaluated by this function
//
// Returns a three-element array:
//   [0] the '$errors' variable (as returned by 'fetchDataFromPubMed()' if that
//       function was called, otherwise the unmodified global '$errors' variable)
//   [1] the fetched source text (empty string if nothing was fetched)
//   [2] the array of failed IDs (empty if all DOIs could be resolved)
//
// NOTE: This function requires the SimpleXML extension which, in turn,
//       requires PHP 5 compiled with the --enable-libxml option.
//
// Author: Nicholaus Lance Hepler <mailto:nhelper@gmail.com>
function fetchDOIsFromPubMed($doiArray, $sourceFormat = "CrossRef XML")
{
	global $errors;

	$sourceText = "";
	$pmidArray = array();
	$failedIDs = array();

	if (!empty($doiArray))
	{
		// Remove any duplicate IDs:
		$doiArray = array_unique($doiArray);

		foreach ($doiArray as $doi)
		{
			// Build query URL:
			// (the DOI gets URL-encoded since DOIs may contain characters such as '#', '<', '>' or ';'
			//  which would otherwise corrupt the query string)
			$sourceURL = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
			           . "?db=pubmed"
			           . "&retmax=1"
			           . "&field=doi"
			           . "&term=" . rawurlencode($doi);

			// Perform query:
			$esearchText = fetchDataFromURL($sourceURL);

			// Parse the response:
			// (an empty or malformed response must not abort the whole import with an
			//  uncaught exception; instead, we treat the DOI as failed)
			$xml = false;

			if (is_string($esearchText) AND $esearchText !== "")
			{
				$previousErrorSetting = libxml_use_internal_errors(true); // collect XML parse errors silently
				$xml = simplexml_load_string($esearchText); // returns FALSE on failure
				libxml_clear_errors();
				libxml_use_internal_errors($previousErrorSetting);
			}

			// Extract PubMed ID:
			// (we store the ID as plain string and not as 'SimpleXMLElement' object)
			$pmid = "";

			if ($xml !== false AND (int) $xml->Count == 1 AND !(isset($xml->ErrorList->PhraseNotFound) AND !empty($xml->ErrorList->PhraseNotFound)))
				$pmid = trim((string) $xml->IdList->Id[0]);

			if ($pmid === "") // no (unique) PubMed ID could be found for this DOI
				$failedIDs[] = $doi;
			else
				$pmidArray[] = $pmid;
		}
	}

	if (!empty($failedIDs))
	{
		// At least one DOI couldn't be resolved, so nothing gets fetched from PubMed and
		// the PubMed IDs that were found are appended to the failed DOIs.
		// NOTE(review): this returns a mix of DOIs and PubMed IDs in '$failedIDs'; it's
		//               unclear from here whether the caller expects that -- TODO: confirm
		$failedIDs = array_merge($failedIDs, $pmidArray);
	}
	else
	{
		// Fetch source data from PubMed.gov for all found PubMed IDs:
		list($errors, $sourceText) = fetchDataFromPubMed($pmidArray);
	}

	return array($errors, $sourceText, $failedIDs);
}
// --------------------------------------------------------------------
// This function tries to fetch record metadata from CrossRef.org for all DOIs or
// OpenURLs given in '$itemArray':
// (for '$sourceFormat', only "CrossRef XML", i.e. the CrossRef "unixref XML" format,
//  is currently supported; more info on the CrossRef OpenURL resolver/metadata server:
//    <http://www.crossref.org/openurl>
//  see also: <http://hublog.hubmed.org/archives/001624.html>)
//
// Parameters:
//   $itemArray    - array of DOIs and/or OpenURL context objects; duplicates are removed
//   $sourceFormat - currently not evaluated by this function (we always request "unixref")
//
// Returns a two-element array:
//   [0] the global '$errors' variable (this function doesn't modify it)
//   [1] the concatenated responses of all queries (empty string if '$itemArray' was empty)
function fetchDataFromCrossRef($itemArray, $sourceFormat = "CrossRef XML")
{
	global $errors;
	global $crossRefReqDat;

	$sourceText = "";

	if (!empty($itemArray))
	{
		// Remove any duplicate IDs:
		$itemArray = array_unique($itemArray);

		// Define response format:
		// (ATM, we always use the "unixref XML" format)
		$fetchType = "unixref";

		// Build the part of the query URL that's identical for all items:
		$baseURL = "http://www.crossref.org/openurl/"
		         . "?noredirect=true"
		         . "&format=" . $fetchType;

		if (!empty($crossRefReqDat))
			$baseURL .= "&pid=" . rawurlencode($crossRefReqDat);

		foreach ($itemArray as $item)
		{
			// Complete the query URL:
			// (DOI registrant codes consist of four *or more* digits, so we must not restrict them to exactly four digits)
			if (preg_match("#^10\.\d{4,}/\S+$#", $item)) // '$item' is a DOI
				$sourceURL = $baseURL . "&id=" . rawurlencode("doi:" . $item);
			else // otherwise we assume a full OpenURL context object // TODO: verify OpenURL!?
				$sourceURL = $baseURL . "&" . $item;

			// Perform query:
			$sourceText .= fetchDataFromURL($sourceURL); // function 'fetchDataFromURL()' is defined in 'include.inc.php'
		}
	}

	return array($errors, $sourceText);
}
// --------------------------------------------------------------------
// CSA TO REFBASE
// This function converts records from Cambridge Scientific Abstracts (CSA) into the standard "refbase"
// array format which can be then imported by the 'addRecords()' function in 'include.inc.php'.
//
// Parameters:
//   $sourceText               - the CSA source data; it gets split into individual records on each
//                               "Record x of xx" header line
//   $importRecordsRadio       - passed on as is to function 'validateRecords()'
//   $importRecordNumbersArray - passed on as is to function 'validateRecords()'
//
// Returns a five-element array:
//   [0] the import array as returned by function 'buildImportArray()'
//   [1] the number of records that were found in '$sourceText'
//   [2] the array of record numbers with recognized format (as returned by 'validateRecords()')
//   [3] the array of record numbers with unrecognized format (as returned by 'validateRecords()')
//   [4] the '$errors' variable (as returned by 'validateRecords()')
function csaToRefbase($sourceText, $importRecordsRadio, $importRecordNumbersArray)
{
	global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'
	global $errors;
	global $showSource;

	// Defines the pattern by which the input text will be split into individual records:
	$recordDelimiter = "\s*Record \d+ of \d+\s*";

	// Matches records of type "Book Monograph":
	$bookMonographPattern = "/^(PT: Publication Type\s+Book Monograph|DT: Document Type\s+B)/m";

	// Matches strings where all of the words are uppercase:
	$allUppercasePattern = "/^[$upper\W\d]+$/$patternModifiers";

	// Separates the field identifier from the field data within fields that get generated below:
	// (this MUST be a line break followed by exactly four spaces, otherwise the label/data splitter
	//  in the field loop won't recognize the generated fields, and their data would get lost)
	$labelDataSeparator = "\r\n    ";

	// PRE-PROCESS SOURCE TEXT:

	// Split input text into individual records:
	$recordArray = splitSourceText($sourceText, $recordDelimiter, false); // split input text on the header text preceding each CSA record (e.g. "\nRecord 4 of 52\n")

	// Count how many records are available:
	$recordsCount = count($recordArray);

	// ----------------------------------------------------------------

	// VALIDATE INDIVIDUAL RECORDS:

	// Note that source data must begin with "\nRecord x of xx\n" and that (opposed to the handling in 'import_csa_modify.php') any text preceding the source data isn't removed but treated as the first record!

	// This array lists patterns which match all CSA tags that must occur within a record to be recognized as valid CSA record:
	// (Array keys must contain the tag name as it should be displayed to the user; as is the case with search & replace actions,
	//  the search patterns MUST include the leading & trailing slashes.)
	//                            "tag display name"  =>  "tag search pattern"
	$requiredTagsArray = array(
		"title" => "/^TI: Title *[\r\n]+ {4,4}/m",
		"author (or editor)" => "/^(AU: Author|ED: Editor) *[\r\n]+ {4,4}/m",
		"source" => "/^SO: Source *[\r\n]+ {4,4}/m" // since the "SO: Source" is also specified as format requirement in function 'identifySourceFormat()' records without "SO: Source" won't be recognized anyhow
	);

	// Validate all records that shall be imported:
	list($errors, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray) = validateRecords($recordArray, $requiredTagsArray, $importRecordsRadio, $importRecordNumbersArray, $errors);

	// ----------------------------------------------------------------

	// PROCESS SOURCE DATA:

	$parsedRecordsArray = array(); // initialize array variable which will hold parsed data of all records that shall be imported

	// LOOP OVER EACH RECORD:
	for ($i = 0; $i < $recordsCount; $i++) // for each record...
	{
		// skip this record if we're NOT supposed to import it (because it was either not selected by the user -OR- because it did contain an unrecognized data format)
		if (!in_array(($i + 1), $importRecordNumbersRecognizedFormatArray)) // '$i' starts with 0 so we have to add 1 to point to the correct record number
			continue; // process next record (if any)

		// ...otherwise, import the current record:
		$singleRecord = $recordArray[$i];

		// is the current record of type "Book Monograph"?
		$isBookMonograph = preg_match($bookMonographPattern, $singleRecord);

		// if the "AU: Author" field is missing BUT the "ED: Editor" is present (which is allowed for book monographs):
		// we replace the "ED: Editor" field identifier with "AU: Author" (this will keep any " (ed)" and " (eds)" tags in place which, in turn, will cause the "is Editor" checkbox in 'record.php' to get marked)
		if (!preg_match("/^AU: Author *[\r\n]+ {4,4}/m", $singleRecord) AND preg_match("/^ED: Editor *[\r\n]+ {4,4}/m", $singleRecord) AND $isBookMonograph)
			$singleRecord = preg_replace("/^ED: Editor(?= *[\r\n]+ {4,4})/m", "AU: Author", $singleRecord);

		// was the field "JN: Journal Name" given within the *original* record data (i.e., before adding stuff to it)?
		$hasJournalName = preg_match("/^JN: Journal Name *[\r\n]+ {4,4}/m", $singleRecord);

		// split each record into its fields:
		$fieldArray = preg_split("/[\r\n]+(?=\w\w: )/", $singleRecord);

		// initialize some variables:
		$fieldParametersArray = array(); // setup an empty array (it will hold all fields that were extracted for a given record)
		$additionalDocumentTypeInfo = ""; // will be used with the "PT: Publication Type" field
		$environmentalRegime = ""; // will be used with the "ER: Environmental Regime" field

		// GENERATE EXTRA FIELDS:
		// check if the fields "MT: Monograph Title", "JN: Journal Name", "JV: Journal Volume", "JI: Journal Issue" and "JP: Journal Pages" are present,
		// if not, we attempt to generate them from the "SO: Source" field:
		$sourceField = preg_replace("/.*SO: Source *[\r\n]+ {4,4}(.+?)(?=([\r\n]+\w\w: |\s*\z)).*/ms", "\\1", $singleRecord); // first, we need to extract the "SO: Source" field data from the record text
		$sourceField = preg_replace("/\s{2,}/", " ", $sourceField); // remove any hard returns and extra spaces within the source field data string

		// does the source field contain some page specification like "213 pp." (AND NOT something like "pp. 76-82" or "p. 216")?
		// (if so, we'll assume below that the record references a whole book)
		$isWholeBook = (preg_match("/[\d,]+ *pp\b/i", $sourceField) AND !preg_match("/(?<=\W)pp?[. ]+[\w\/,-]+/i", $sourceField));

		// if the current record is of type "Book Monograph" but the field "MT: Monograph Title" is missing:
		if ($isBookMonograph AND !preg_match("/^MT: Monograph Title *[\r\n]+ {4,4}/m", $singleRecord))
		{
			$extractedSourceFieldData = preg_replace("/^([^.[]+).*/", "\\1", $sourceField); // attempt to extract the full monograph title from the source field

			if (preg_match($allUppercasePattern, $extractedSourceFieldData)) // if all of the words within the monograph title are uppercase, we attempt to convert the string to something more readable:
				// perform case transformation (e.g. convert "BIOLOGY AND ECOLOGY OF GLACIAL RELICT CRUSTACEA" into "Biology And Ecology Of Glacial Relict Crustacea")
				$extractedSourceFieldData = changeCase('title', $extractedSourceFieldData); // function 'changeCase()' is defined in 'include.inc.php'

			$fieldArray[] = "MT: Monograph Title" . $labelDataSeparator . $extractedSourceFieldData; // add field "MT: Monograph Title" to the array of fields
		}
		// else if the current record is of type "Journal Article", "Report", etc (or wasn't specified) but the field "JN: Journal Name" is missing:
		elseif (!$hasJournalName)
		{
			if (preg_match("/\[/", $sourceField)) // if the source field data contain a square bracket we assume a format like: "Journal of Phycology [J. Phycol.]. Vol. 37, no. s3, pp. 18-18. Jun 2001."
				$extractedSourceFieldData = preg_replace("/^([^.[]+).*/", "\\1", $sourceField); // attempt to extract the full journal name from the source field
			else // source field format might be something like: "Phycologia, vol. 34, no. 2, pp. 135-144, 1995"
				$extractedSourceFieldData = preg_replace("/^([^.,]+).*/", "\\1", $sourceField); // attempt to extract the full journal name from the source field

			if (preg_match($allUppercasePattern, $extractedSourceFieldData)) // if all of the words within the journal name are uppercase, we attempt to convert the string to something more readable:
				// perform case transformation (e.g. convert "POLAR BIOLOGY" into "Polar Biology")
				$extractedSourceFieldData = changeCase('title', $extractedSourceFieldData);

			$fieldArray[] = "JN: Journal Name" . $labelDataSeparator . $extractedSourceFieldData; // add field "JN: Journal Name" to the array of fields
		}

		// if the "JV: Journal Volume" is missing BUT the "SO: Source" field contains a volume specification:
		if (!preg_match("/^JV: Journal Volume *[\r\n]+ {4,4}/m", $singleRecord) AND preg_match("/(?<=\W)vol[. ]+[\w\/-]+/i", $sourceField))
		{
			$extractedSourceFieldData = preg_replace("/.*(?<=\W)vol[. ]+([\w\/-]+).*/i", "\\1", $sourceField); // attempt to extract the journal volume from the source field

			$fieldArray[] = "JV: Journal Volume" . $labelDataSeparator . $extractedSourceFieldData; // add field "JV: Journal Volume" to the array of fields
		}

		// if the "JI: Journal Issue" is missing BUT the "SO: Source" field contains an issue specification:
		if (!preg_match("/^JI: Journal Issue *[\r\n]+ {4,4}/m", $singleRecord) AND preg_match("/(?<=\W)no[. ]+[\w\/-]+/i", $sourceField))
		{
			$extractedSourceFieldData = preg_replace("/.*(?<=\W)no[. ]+([\w\/-]+).*/i", "\\1", $sourceField); // attempt to extract the journal issue from the source field

			$fieldArray[] = "JI: Journal Issue" . $labelDataSeparator . $extractedSourceFieldData; // add field "JI: Journal Issue" to the array of fields
		}

		// if the "JP: Journal Pages" is missing BUT the "SO: Source" field contains a pages specification:
		if (!preg_match("/^JP: Journal Pages *[\r\n]+ {4,4}/m", $singleRecord) AND preg_match("/((?<=\W)pp?[. ]+[\w\/,-]+|[\d,]+ *pp\b)/i", $sourceField))
		{
			if (preg_match("/(?<=\W)pp?[. ]+[\w\/,-]+/i", $sourceField)) // e.g. "pp. 212-217" or "p. 216" etc
				$extractedSourceFieldData = preg_replace("/.*(?<=\W)pp?[. ]+([\w\/,-]+).*/i", "\\1", $sourceField); // attempt to extract the journal pages from the source field
			else // e.g. "452 pp" (since the first alternative of the above check didn't match, the second one must have matched, case-insensitively)
				$extractedSourceFieldData = preg_replace("/.*?([\d,]+ *pp)\b.*/i", "\\1", $sourceField); // attempt to extract the journal pages from the source field

			$extractedSourceFieldData = preg_replace("/,/", "", $extractedSourceFieldData); // remove any thousands separators from journal pages

			$fieldArray[] = "JP: Journal Pages" . $labelDataSeparator . $extractedSourceFieldData; // add field "JP: Journal Pages" to the array of fields
		}

		// Additionally, we extract the abbreviated journal name from the "SO: Source" field (if available):
		if (preg_match("/\[/", $sourceField)) // if the source field data contain a square bracket we assume a format like: "Journal of Phycology [J. Phycol.]. Vol. 37, no. s3, pp. 18-18. Jun 2001."
		{
			$extractedSourceFieldData = preg_replace("/.*\[(.+?)\].*/", "\\1", $sourceField); // attempt to extract the abbreviated journal name from the source field
			$extractedSourceFieldData = preg_replace("/\./", "", $extractedSourceFieldData); // remove any dots from the abbreviated journal name

			if (preg_match($allUppercasePattern, $extractedSourceFieldData)) // if all of the words within the abbreviated journal name are uppercase, we attempt to convert the string to something more readable:
				// perform case transformation (e.g. convert "BALT SEA ENVIRON PROC" into "Balt Sea Environ Proc")
				$extractedSourceFieldData = changeCase('title', $extractedSourceFieldData);

			$fieldArray[] = "JA: Abbrev Journal Name" . $labelDataSeparator . $extractedSourceFieldData; // add field "JA: Abbrev Journal Name" to the array of fields (note that this field normally does NOT occur within the CSA full record format!)
		}
		// (END GENERATE EXTRA FIELDS)

		// LOOP OVER EACH FIELD:
		foreach ($fieldArray as $singleField) // for each field within the current record...
		{
			$singleField = preg_replace("/^(\w\w: [^\r\n]+)[\r\n]+ {4,4}/", "\\1___LabelDataSplitter___", $singleField); // insert a unique text string between the field identifier and the field data
			$fieldLabelPlusDataArray = preg_split("/___LabelDataSplitter___/", $singleField); // split each field into a 2-element array containing [0] the field identifier and [1] the field data

			$fieldLabel = $fieldLabelPlusDataArray[0];
			$fieldData = isset($fieldLabelPlusDataArray[1]) ? $fieldLabelPlusDataArray[1] : ""; // text that doesn't follow the "label + indented data" layout has no data part

			$fieldData = preg_replace("/\s{2,}/", " ", $fieldData); // remove any hard returns and extra spaces within the data string
			$fieldData = trim($fieldData); // remove any preceding and trailing whitespace from the field data

			if (preg_match("/AU: Author/", $fieldLabel))
			{
				$fieldData = preg_replace("/\*/", "", $fieldData); // remove any asterisk ("*")
				$fieldData = standardizePersonNames($fieldData, true, " *; *", " *, *", true); // standardize person names
			}
			elseif (preg_match("/ED: Editor/", $fieldLabel))
			{
				$fieldData = preg_replace("/ \(eds?\)(?= *$| *;)/", "", $fieldData); // remove " (ed)" and/or " (eds)"
				$fieldData = standardizePersonNames($fieldData, true, " *; *", " *, *", true); // standardize person names
			}
			elseif (preg_match("/TI: Title|AB: Abstract/", $fieldLabel))
			{
				if (preg_match("/TI: Title/", $fieldLabel))
				{
					$fieldData = preg_replace("/--/", "-", $fieldData); // remove en-dash markup
					$fieldData = preg_replace("/ *\. *$/", "", $fieldData); // remove any dot from end of title
				}

				if (preg_match("/ su(b|per)\(.+?\)/", $fieldData))
					$fieldData = preg_replace("/ (su(?:b|per))\((.+?)\)/", "[\\1:\\2]", $fieldData); // transform " sub(...)" & " super(...)" markup into "[sub:...]" & "[super:...]" markup

				if (preg_match("/(?<= )mu /", $fieldData))
					$fieldData = preg_replace("/(?<= )mu /", "[mu]", $fieldData); // transform "mu " markup into "[mu]" markup
			}

			// BUILD FIELD PARAMETERS:
			// build an array of key/value pairs:

			// "AU: Author":
			if (preg_match("/AU: Author/", $fieldLabel))
				$fieldParametersArray['author'] = $fieldData;

			// "TI: Title":
			elseif (preg_match("/TI: Title/", $fieldLabel))
				$fieldParametersArray['title'] = $fieldData;

			// "PT: Publication Type":
			elseif (preg_match("/PT: Publication Type/", $fieldLabel)) // could also check for "DT: Document Type" (but DT was added only recently)
			{
				if (preg_match("/[;:,.]/", $fieldData)) // if the "PT: Publication Type" field contains a delimiter (e.g. like: "Journal Article; Conference")
				{
					$correctDocumentType = preg_replace("/(.+?)\s*[;:,.]\s*.*/", "\\1", $fieldData); // extract everything before this delimiter
					$additionalDocumentTypeInfo = preg_replace("/.*?\s*[;:,.]\s*(.+)/", "\\1", $fieldData); // extract everything after this delimiter (this info will be appended to any notes field data, see below)
				}
				else // we take the "PT: Publication Type" field contents as they are
					$correctDocumentType = $fieldData;

				// Note that for books the "PT: Publication Type" field will always start with "Book Monograph", no matter whether the referenced
				// publication is a whole book or just a book chapter within that book! This is a design flaw within the CSA full record format.
				// So we can only apply some "good guessing" whether the current record actually references a complete book or just a book chapter:
				if ($isBookMonograph) // if the current record is of type "Book Monograph"
				{
					// and if the source field contains some page specification like "213 pp." (AND NOT something like "pp. 76-82" or "p. 216")...
					if ($isWholeBook)
						$correctDocumentType = "Book Whole"; // ...we assume its a whole book
					else
						$correctDocumentType = "Book Chapter"; // ...otherwise we assume its a book chapter (which may NOT always be correct!)
				}

				$fieldParametersArray['type'] = $correctDocumentType;
			}

			// "PY: Publication Year":
			elseif (preg_match("/PY: Publication Year/", $fieldLabel))
				$fieldParametersArray['year'] = $fieldData;

			// "JN: Journal Name":
			elseif (preg_match("/JN: Journal Name/", $fieldLabel))
			{
				// if the current record is of type "Book Monograph" AND the field "JN: Journal Name" was given within the *original* record data (i.e., before adding stuff to it):
				if ($isBookMonograph AND $hasJournalName)
					// for book monographs the publication title is given in "MT: Monograph Title"; if a "JN: Journal Name" was originally provided as well, we assume, it's the series title:
					$fieldParametersArray['series_title'] = $fieldData;
				else
					$fieldParametersArray['publication'] = $fieldData;
			}

			// "JA: Abbrev Journal Name":
			elseif (preg_match("/JA: Abbrev Journal Name/", $fieldLabel))
			{
				if ($isBookMonograph) // if the current record is of type "Book Monograph"
					// for book monographs the publication title is given in "MT: Monograph Title"; if a "JA: Abbrev Journal Name" is provided as well, we assume, it's the abbreviated series title:
					$fieldParametersArray['abbrev_series_title'] = $fieldData;
				else
					$fieldParametersArray['abbrev_journal'] = $fieldData;
			}

			// "MT: Monograph Title":
			elseif (preg_match("/MT: Monograph Title/", $fieldLabel))
			{
				// if the source field contains some page specification like "213 pp." (AND NOT something like "pp. 76-82" or "p. 216")...
				if ($isWholeBook)
					// ...we assume its a whole book (see above comment), in which case we assign the monograph title to the series title field:
					$fieldParametersArray['series_title'] = $fieldData;
				else
					$fieldParametersArray['publication'] = $fieldData;
			}

			// "JV: Journal Volume":
			elseif (preg_match("/JV: Journal Volume/", $fieldLabel))
			{
				if ($isBookMonograph) // if the current record is of type "Book Monograph"
					// for book monographs, if there's a volume given, we assume, it's the series volume:
					$fieldParametersArray['series_volume'] = $fieldData;
				else
					$fieldParametersArray['volume'] = $fieldData;
			}

			// "JI: Journal Issue":
			elseif (preg_match("/JI: Journal Issue/", $fieldLabel))
			{
				if ($isBookMonograph) // if the current record is of type "Book Monograph"
					// for book monographs, if there's an issue given, we assume, it's the series issue:
					$fieldParametersArray['series_issue'] = $fieldData;
				else
					$fieldParametersArray['issue'] = $fieldData;
			}

			// "JP: Journal Pages":
			elseif (preg_match("/JP: Journal Pages/", $fieldLabel))
				$fieldParametersArray['pages'] = $fieldData;

			// "AF: Affiliation" & "AF: Author Affilition":
			elseif (preg_match("/AF: (Author )?Affilia?tion/", $fieldLabel))
				$fieldParametersArray['address'] = $fieldData;

			// "CA: Corporate Author":
			elseif (preg_match("/CA: Corporate Author/", $fieldLabel))
				$fieldParametersArray['corporate_author'] = $fieldData;

			// "DE: Descriptors":
			elseif (preg_match("/DE: Descriptors/", $fieldLabel)) // currently, the fields "KW: Keywords" and "ID: Identifiers" are ignored!
				$fieldParametersArray['keywords'] = $fieldData;

			// "AB: Abstract":
			elseif (preg_match("/AB: Abstract/", $fieldLabel))
				$fieldParametersArray['abstract'] = $fieldData;

			// "PB: Publisher":
			elseif (preg_match("/PB: Publisher/", $fieldLabel))
			{
				if (preg_match($allUppercasePattern, $fieldData)) // if all of the words within the publisher name are uppercase, we attempt to convert the string to something more readable:
					// perform case transformation (e.g. convert "ELSEVIER SCIENCE B.V." into "Elsevier Science B.V.")
					$fieldData = changeCase('title', $fieldData);

				$fieldParametersArray['publisher'] = $fieldData;
			}

			// "ED: Editor":
			elseif (preg_match("/ED: Editor/", $fieldLabel))
				$fieldParametersArray['editor'] = $fieldData;

			// "LA: Language":
			elseif (preg_match("/LA: Language/", $fieldLabel))
				$fieldParametersArray['language'] = $fieldData;

			// "SL: Summary Language":
			elseif (preg_match("/SL: Summary Language/", $fieldLabel))
				$fieldParametersArray['summary_language'] = $fieldData;

			// "OT: Original Title":
			elseif (preg_match("/OT: Original Title/", $fieldLabel))
				$fieldParametersArray['orig_title'] = $fieldData;

			// "IS: ISSN":
			elseif (preg_match("/IS: ISSN/", $fieldLabel))
				$fieldParametersArray['issn'] = $fieldData;

			// "IB: ISBN":
			elseif (preg_match("/IB: ISBN/", $fieldLabel))
				$fieldParametersArray['isbn'] = $fieldData;

			// "ER: Environmental Regime":
			elseif (preg_match("/ER: Environmental Regime/", $fieldLabel))
				$environmentalRegime = $fieldData; // this info will be appended to any notes field data (see below)

			// "CF: Conference":
			elseif (preg_match("/CF: Conference/", $fieldLabel))
				$fieldParametersArray['conference'] = $fieldData;

			// "NT: Notes":
			elseif (preg_match("/NT: Notes/", $fieldLabel))
				$fieldParametersArray['notes'] = $fieldData;

			// "DO: DOI":
			elseif (preg_match("/DO: DOI/", $fieldLabel))
				$fieldParametersArray['doi'] = $fieldData;
		}
		// (END LOOP OVER EACH FIELD)

		if (!empty($showSource)) // if we're supposed to display the original source data
			// append original source field data (they will be presented within the header message of 'record.php' for easy comparison with the extracted data):
			$fieldParametersArray['source'] = $sourceField;

		// we'll hack the "notes" element in order to append additional info:
		// (this cannot be done earlier above since we don't know about the presence & order of fields within the source text!)
		if (!empty($additionalDocumentTypeInfo)) // if the "PT: Publication Type" field contains some additional info
		{
			if (isset($fieldParametersArray['notes'])) // and if the notes element is present
				$fieldParametersArray['notes'] = $fieldParametersArray['notes'] . "; " . $additionalDocumentTypeInfo; // append additional info from "PT: Publication Type" field
			else // the notes parameter wasn't specified yet
				$fieldParametersArray['notes'] = $additionalDocumentTypeInfo; // add notes element with additional info from "PT: Publication Type" field
		}

		if (!empty($environmentalRegime)) // if the "ER: Environmental Regime" field contains some data
		{
			if (isset($fieldParametersArray['notes'])) // and if the notes element is present
				$fieldParametersArray['notes'] = $fieldParametersArray['notes'] . "; " . $environmentalRegime; // append "ER: Environmental Regime" field data
			else // the notes parameter wasn't specified yet
				$fieldParametersArray['notes'] = $environmentalRegime; // add notes element with "ER: Environmental Regime" field data
		}

		// Append the array of extracted field data to the main data array which holds all records to import:
		$parsedRecordsArray[] = $fieldParametersArray;
	}
	// (END LOOP OVER EACH RECORD)

	// ----------------------------------------------------------------

	// BUILD REFBASE IMPORT ARRAY:
	$importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
	                                    "1.0", // 'version' - the version of the given array structure
	                                    "http://refbase.net/import/csa/", // 'creator' - the name of the script/importer (preferably given as unique URI)
	                                    "Matthias Steffens", // 'author' - author/contact name of the person who's responsible for this script/importer
	                                    "refbase@extracts.de", // 'contact' - author's email/contact address
	                                    array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
	                                    $parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)

	return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
}
// --------------------------------------------------------------------
/*
// NOTE: by default, this function is currently disabled, since it uses DOM which is part of PHP 5 but must
// be installed as a separate PEAR extension for PHP 4. In order to provide widest compatibility with PHP 4,
// this function should be modified so that it makes use of ActiveLink's XML package instead:
// <http://www.active-link.com/software/>
// PUBMED TO CSA
// This function takes a PubMed ID and fetches corresponding PubMed XML record data from the PubMed server.
// Record data will be converted to CSA format which can be imported via 'import_csa_modify.php'.
//
// Authors: this function was originally written in Python by Andreas Hildebrandt <anhi@bioinf.uni-sb.de>
// and was ported to PHP by Marc Sturm <sturm@informatik.uni-tuebingen.de>
function pubmedToCsa($pubmedID)
{
global $contentTypeCharset;
$months = array('Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06',
'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12');
$use_proxy=false;
function proxy_url($proxy_url)
{
$proxy_name = 'www-cache.informatik.uni-tuebingen.de';
$proxy_port = 3128;
$proxy_user = '';
$proxy_pass = '';
$proxy_cont = '';
$proxy_fp = fsockopen($proxy_name, $proxy_port);
if (!$proxy_fp) {return false;}
fputs($proxy_fp, "GET $proxy_url HTTP/1.0\r\nHost: $proxy_name\r\n");
fputs($proxy_fp, "Proxy-Authorization: Basic " . base64_encode("$proxy_user:$proxy_pass") . "\r\n\r\n");
while(!feof($proxy_fp)) { $proxy_cont .= fread($proxy_fp,4096); }
fclose($proxy_fp);
$proxy_cont = substr($proxy_cont, strpos($proxy_cont,"\r\n\r\n")+4);
return $proxy_cont;
}
if ($use_proxy)
$file = proxy_url("http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=".escapeshellcmd($pubmedID)."&retmode=xml");
else
$file = file_get_contents("http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=".escapeshellcmd($pubmedID)."&retmode=xml");
$doc = DOMDocument::loadXML($file);
$doc->preserveWhiteSpace = false;
$xpath = new DOMXPath($doc);
//-------------------------------------------------------------------------
// This parses the XML data:
// 1) Find the article (assume only one at this point...)
// 2) Do we need to add "et.al" to Authors?
// 3) Only one affiliation...
//-------------------------------------------------------------------------
$result = "";
$articles = $doc->getElementsByTagName('PubmedArticle');
foreach ($articles as $ref)
{
$med = $ref->getElementsByTagName('MedlineCitation')->item(0);
$article = $med->getElementsByTagName('Article')->item(0);
$title = $xpath->query("ArticleTitle/text()", $article)->item(0)->nodeValue;
$result .= "TI: Title\n $title\n";
$author_list = $article->getElementsByTagName('AuthorList')->item(0);
if ($author_list->attributes->getNamedItem('CompleteYN')->value == 'N')
$add_et_al = true;
else
$add_et_al = false;
$authors = $author_list->getElementsByTagName('Author');
$author_line = "";
foreach ($authors as $author)
{
$author_line .= $xpath->query("LastName/text()", $author)->item(0)->nodeValue;
$author_line .= ", ";
$forename = $xpath->query("ForeName/text()", $author);
if ($forename->length == 0)
$forename = $xpath->query("Initials/text()", $author);
if ($forename->length > 0)
$author_line .= $forename->item(0)->nodeValue;
$author_line .= "; ";
}
if ($add_et_al)
$author_line = substr($author_line,0,-2) . " et al.";
else
$author_line = substr($author_line,0,-2);
$result .= "AU: Author\n $author_line\n";
$affiliation = $xpath->query("Affiliation/text()", $article);
if ($affiliation->length > 0)
$result .= "AF: Affiliation\n ".$affiliation->item(0)->nodeValue."\n";
if ($ref->getElementsByTagName('MedlineJournalInfo')->length == 0) {
print "No useable source information given!";
exit(1);
}
$source = $xpath->query("MedlineJournalInfo/MedlineTA/text()", $med)->item(0)->nodeValue.". ";
if ($xpath->query("Journal/JournalIssue/Volume/text()", $article)->length > 0)
$source .= "Vol. " . $xpath->query("Journal/JournalIssue/Volume/text()", $article)->item(0)->nodeValue;
if ($xpath->query("Journal/JournalIssue/Issue/text()", $article)->length > 0)
$source .= " no. " . $xpath->query("Journal/JournalIssue/Issue/text()", $article)->item(0)->nodeValue;
if ($xpath->query("Pagination/MedlinePgn/text()", $article)->length > 0)
$source .= ", pp. " . $xpath->query("Pagination/MedlinePgn/text()", $article)->item(0)->nodeValue;
if ($xpath->query("Journal/JournalIssue/PubDate/Year", $article)->length > 0)
$source .= ". " . $xpath->query("Journal/JournalIssue/PubDate/Year/text()", $article)->item(0)->nodeValue . ".";
if ($source != "")
$result .= "SO: Source\n " . $source . "\n";
if ($xpath->query("Journal/ISSN", $article)->length > 0)
$result .= "IS: ISSN\n " . $xpath->query("Journal/ISSN/text()", $article)->item(0)->nodeValue . "\n";
if ($xpath->query("Abstract/AbstractText", $article)->length > 0)
$result .= "AB: Abstract\n " . $xpath->query("Abstract/AbstractText/text()", $article)->item(0)->nodeValue . "\n";
if ($xpath->query("Language", $article)->length > 0)
$result .= "LA: Language\n " . $xpath->query("Language/text()", $article)->item(0)->nodeValue . "\n";
$pubdate = "";
if ($xpath->query("Journal/JournalIssue/PubDate", $article)->length > 0)
{
$year = $xpath->query("Journal/JournalIssue/PubDate/Year/text()", $article);
if ($year > 0)
{
$pubdate = $year->item(0)->nodeValue;
$month = $xpath->query("Journal/JournalIssue/PubDate/Month/text()", $article);
if ($month > 0)
{
$pubdate .= $months[$month->item(0)->nodeValue];
$day = $xpath->query("Journal/JournalIssue/PubDate/Day/text()", $article);
if ($day->length > 0)
$pubdate .= $day->item(0)->nodeValue;
else
$pubdate .= "00";
}else{
$pubdate = $pubdate . "00";
}
}
$result .= "PD: Publication Date\n " . $pubdate . "\n";
}
$ptl = $article->getElementsByTagName('PublicationTypeList');
$publication_type = "";
if ($ptl->length > 0)
{
$pts = $xpath->query("PublicationTypeList/PublicationType/text()", $article);
for ($i=0; $i<$pts->length ; ++$i)
//{
$publication_type .= $pts->item($i)->nodeValue . "; ";
//}
}
if ($publication_type != "")
$result .= "PT: Publication Type\n " . substr($publication_type,0,-2) . "\n";
// collect all MeshHeadings and put them as descriptors.
// this currently ignores all other types of keywords
$descs = $xpath->query("MeshHeadingList/MeshHeading/DescriptorName/text()", $med);
$desc_line = "";
for ($i=0; $i<$descs->length ; ++$i)
$desc_line .= $descs->item($i)->nodeValue . "; ";
if ($desc_line != "")
$result .= "DE: Descriptors\n " . substr($desc_line,0,-2) . "\n";
$year = $xpath->query("Journal/JournalIssue/PubDate/Year/text()", $article) ;
if ($year > 0)
$result .= "PY: Publication Year\n " . $year->item(0)->nodeValue . "\n";
}
if ($contentTypeCharset == "ISO-8859-1")
$result = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $result); // convert text from Unicode UTF-8 encoding to ISO Latin 1
return $result;
}
*/
// --------------------------------------------------------------------
?>