WEETAT - 2019-01-14

Hi all,
I need to crawl this page URL:
https://stomp.straitstimes.com/singapore-seen/moe-to-take-action-against-lewd-instagram-account-targeting-junior-college-girls

We get an error at the scraper.execute(); line in the method below:

private Map<String, Object> crawlArticleData(String sourceUrl, Resource configFile) throws IOException {

    InputSource configIn = new InputSource(configFile.getInputStream());
    //Execute Web Harvest process to extract content
    ScraperConfiguration config = new ScraperConfiguration(configIn);

    Scraper scraper = null;
    String articleContent = "";
    List<String> imageUrls=null;
    try {

        scraper = new Scraper(config, "");
        //Config timeout for httpClient used in Scraper
        org.apache.commons.httpclient.HttpClient scraperHttpClient = scraper.getHttpClientManager().getHttpClient();
        scraperHttpClient.getParams().setParameter("http.socket.timeout", new Integer(SO_TIMEOUT));
        scraperHttpClient.getParams().setParameter("http.connection.timeout", new Integer(CONNECTION_TIMEOUT));

        scraper.addVariableToContext("articleUrl", sourceUrl);
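        //the error described above is thrown by this next call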
        scraper.execute();

        //get article content
        Variable articleContentVariable = scraper.getContext().getVar("articleContent");
        if (articleContentVariable == null || articleContentVariable.toBinary() == null) {
            logger.debug("Fail to extract body content for " + sourceUrl);
            return null;
        }

        articleContent = new String(articleContentVariable.toBinary(), "UTF-8");
        if ( !org.apache.commons.lang.StringUtils.isBlank( articleContent ) && articleContent.length() > 2000000 )
        {
            logger.info( "Reject article because content is too large. Article size: " + articleContent.length() );
            throw new IOException("Article content too large: " + articleContent.length());
        }

        Variable imageUrlVariable = scraper.getContext().getVar("imageUrl");

        imageUrls = new ArrayList<String>();

        if (imageUrlVariable != null) {
            String imageUrlsString = imageUrlVariable.toString();
            if (!StringUtils.isEmpty(imageUrlsString)) {
                imageUrls.addAll(Arrays.asList(imageUrlsString.split("\n")));
            }
        }
    } catch (Exception ex) {
        logger.error("Crawler error sourceUrl=" + sourceUrl, ex);
    } finally {
        //clean up (guard against a null scraper if construction failed)
        if (scraper != null) {
            scraper.dispose();
        }
        configIn.getByteStream().close();
    }

    Map<String, Object> articleData = new HashMap<String, Object>();
    articleData.put("articleContent", articleContent);
    articleData.put("imageUrls", imageUrls);

    if (!org.apache.commons.lang.StringUtils.isBlank(articleContent))
        logger.info("Crawl completed. Article content size: " + articleContent.length());
    else
        logger.info("Crawl completed.");

    return articleData;
}
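
For reference, here is a minimal standalone fetch of the same URL with plain Commons HttpClient 3.x (the same client the scraper uses underneath). This is only a sketch to help isolate whether the failure happens in Web-Harvest or in the HTTP call itself; the helper name checkUrlReachable and the hard-coded timeout values are placeholders, not my real code:

private void checkUrlReachable(String url) throws IOException {
    //Plain Commons HttpClient 3.x request, bypassing Web-Harvest entirely
    org.apache.commons.httpclient.HttpClient client = new org.apache.commons.httpclient.HttpClient();
    client.getParams().setParameter("http.socket.timeout", Integer.valueOf(30000));      //placeholder timeout
    client.getParams().setParameter("http.connection.timeout", Integer.valueOf(10000));  //placeholder timeout

    org.apache.commons.httpclient.methods.GetMethod get =
            new org.apache.commons.httpclient.methods.GetMethod(url);
    try {
        int status = client.executeMethod(get);
        String body = get.getResponseBodyAsString();
        logger.info("Status: " + status + ", body length: " + (body == null ? 0 : body.length()));
    } finally {
        get.releaseConnection();
    }
}

If this direct request also fails (for example with an SSL or redirect problem), the issue is in the HTTP layer rather than in my Web-Harvest config.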