Web Scraping

Scraping Google Maps using Puppeteer

This is a Node.js script that scrapes Google Maps listings using Puppeteer.

const puppeteer = require('puppeteer');

// Persistence switch: the scraper only issues SQL statements when this is 1.
// Declared explicitly so the script does not rely on an implicit global.
const sqlmode = 1;
const mysql = require('mysql');
// NOTE: the connection is opened regardless of `sqlmode`; the settings below
// are hard-coded (root user, empty password, `google_pg` database).
const connection = mysql.createConnection({
    host: 'localhost',
    user: 'root',
    password: '',
    database: 'google_pg'
});
connection.connect((err) => {
    // Fail fast: nothing can be stored if the initial connection fails.
    if (err) throw err;
    console.log('Connected to MySQL Server!');
});
/**
 * Splits a comma-separated address into the columns stored for a place.
 *
 * Segments are read from the END of the address: the last segment is taken as
 * "<state> <pin>", the one before it as the district, then locality 1 and
 * locality 2. Segments that do not exist are returned as ''.
 *
 * @param {string} address - Trimmed address text.
 * @returns {{address_pin: string, address_state: string, address_district: string,
 *            address_locality_1: string, address_locality_2: string}}
 */
function parseAddress(address) {
    const parts = address.split(',').map((part) => part.trim());
    // Counts back from the end of the address; missing segments become ''.
    const fromEnd = (offset) => {
        const index = parts.length - 1 - offset;
        return index >= 0 ? parts[index] : '';
    };

    const tokens = fromEnd(0).split(/\s+/);
    let state = tokens[0];
    let pin = tokens.length > 1 ? tokens[1] : '';
    // When the segment ends in digits, treat them as the pin and everything
    // before them as the state, so multi-word state names stay intact.
    if (tokens.length > 1 && /^\d+$/.test(tokens[tokens.length - 1])) {
        pin = tokens[tokens.length - 1];
        state = tokens.slice(0, -1).join(' ');
    }

    return {
        address_pin: pin,
        address_state: state,
        address_district: fromEnd(1),
        address_locality_1: fromEnd(2),
        address_locality_2: fromEnd(3),
    };
}

/**
 * Extracts the image URL from an inline `background-image:url(...)` style and
 * drops everything from the first '=' onwards.
 *
 * @param {string} style - Value of the element's `style` attribute.
 * @returns {string} The URL without the wrapper and without the '=' suffix.
 */
function extractImageUrl(style) {
    const url = style.replace('background-image:url(', '').replace(')', '');
    const suffixAt = url.indexOf('=');
    // Only cut when a suffix exists; cutting at -1 would corrupt the URL.
    return suffixAt >= 0 ? url.slice(0, suffixAt) : url;
}

/**
 * Sends one parameterized statement over the shared MySQL `connection`.
 * Does nothing unless `sqlmode` is 1. Errors are logged instead of ignored.
 *
 * @param {string} sql - Statement using `?` placeholders.
 * @param {Array} values - Values bound to the placeholders.
 * @param {string} successMessage - Logged when the statement succeeds.
 */
function runPlaceQuery(sql, values, successMessage) {
    if (Number(sqlmode) !== 1) {
        return;
    }
    connection.query(sql, values, (err) => {
        if (err) {
            console.error('query failed: ' + err.message);
            return;
        }
        console.log(successMessage);
    });
}

/**
 * Updates the given columns of one row in `papa_places_tmp`.
 *
 * @param {string} placeId - Value of the row's `place_id`.
 * @param {Object} fields - Column name -> new value.
 */
function updatePlace(placeId, fields) {
    // `set ?` with an object lets the mysql driver escape every column value.
    runPlaceQuery('update papa_places_tmp set ? where place_id = ?', [fields, placeId], '1 record updated');
}

/**
 * Reads a value from the first element matching `selector`.
 *
 * @param {Object} page - Puppeteer page.
 * @param {string} selector - CSS selector to look up.
 * @param {Function} read - Runs against the matched node and returns the value.
 * @returns {Promise<*>} The value, or null when the element or value is absent.
 */
async function readNode(page, selector, read) {
    const handle = await page.$(selector);
    if (handle === null) {
        return null;
    }
    const value = await handle.evaluate(read);
    return value == null ? null : value;
}

/**
 * Reads the details of the currently opened listing, stores each detail that
 * is present, and returns them as one record. Absent details stay 'na'
 * (address parts stay '').
 *
 * @param {Object} page - Puppeteer page with a listing already opened.
 * @param {string} placeId - The listing's `data-cid` value.
 * @param {number} position - 1-based position of the listing, used for logging.
 * @returns {Promise<Object>} The scraped record.
 */
async function scrapePlaceDetails(page, placeId, position) {
    const place = {
        place_id: placeId,
        name: 'na',
        address: 'na',
        address_pin: '',
        address_state: '',
        address_district: '',
        address_locality_1: '',
        address_locality_2: '',
        phone: 'na',
        rating: 'na',
        total_reviews: 'na',
        image: 'na',
        category: 'na',
        timings: 'na',
    };

    // get title
    const name = await readNode(page, 'h2[data-attrid="title"]', (node) => node.innerText);
    if (name !== null) {
        place.name = name;
        updatePlace(placeId, { place_name: name });
    }
    console.log(position + " name:" + " : " + place.name);

    // get address
    const rawAddress = await readNode(page, 'div[data-attrid="kc:/location/location:address"] span:nth-child(2)', (node) => node.innerText);
    if (rawAddress !== null) {
        const address = rawAddress.trim();
        const addressParts = parseAddress(address);
        Object.assign(place, { address: address }, addressParts);
        updatePlace(placeId, Object.assign({ place_address: address }, addressParts));
    }
    console.log("address:" + place.address);
    console.log("address_pin:" + place.address_pin);
    console.log("address_state:" + place.address_state);
    console.log("address_locality_1:" + place.address_locality_1);
    console.log("address_locality_2:" + place.address_locality_2);
    console.log("address_district:" + place.address_district);

    // get phone
    const phone = await readNode(page, 'div[data-attrid="kc:/collection/knowledge_panels/has_phone:phone"] span:nth-child(2) > span > a > span', (node) => node.innerHTML);
    if (phone !== null) {
        place.phone = phone;
        updatePlace(placeId, { place_mobile: phone });
    }
    console.log("phone:" + place.phone);

    // get rating: label looks like "Rated 4.5 out of 5", only "4.5" is stored
    const rating = await readNode(page, 'g-review-stars span', (node) => node.getAttribute('aria-label'));
    if (rating !== null) {
        place.rating = rating;
        updatePlace(placeId, { place_rating: rating.split(" ")[1] });
    }
    console.log("rated: " + place.rating);

    // get total reviews: text looks like "2 google reviews", only "2" is stored
    const totalReviews = await readNode(page, 'a[data-async-trigger="reviewDialog"] span', (node) => node.innerText);
    if (totalReviews !== null) {
        place.total_reviews = totalReviews;
        updatePlace(placeId, { place_totalreview: totalReviews.split(" ")[0] });
    }
    console.log("totalreview: " + place.total_reviews);

    // get image (taken from the element's inline background-image style)
    const imageStyle = await readNode(page, 'div[data-attrid="kc:/location/location:hotel media"] > div > a > div', (node) => node.getAttribute('style'));
    if (imageStyle !== null) {
        place.image = extractImageUrl(imageStyle);
        updatePlace(placeId, { place_image: place.image });
    }
    console.log("image: " + place.image);

    // get category
    const category = await readNode(page, 'div[data-attrid="kc:/local:lu attribute list"] > div > div > span', (node) => node.innerText);
    if (category !== null) {
        place.category = category;
        updatePlace(placeId, { place_tags: category });
    }
    console.log("category: " + place.category);

    // get timing (stored as the raw inner HTML of the hours table)
    const timings = await readNode(page, 'div[data-attrid="kc:/location/location:hours"] > div > div > div > div:nth-child(2) > div > table', (node) => node.innerHTML);
    if (timings !== null) {
        place.timings = timings;
        updatePlace(placeId, { place_timings: timings });
    }
    console.log("timing: " + place.timings);

    return place;
}

/**
 * Scrapes every listing on the currently displayed results page: inserts a
 * (place_id, place_title) row per listing, then opens each listing in turn
 * and stores its details.
 *
 * @param {Object} page - Puppeteer page showing a results list.
 * @returns {Promise<Object[]>} One record per listing that was opened.
 */
async function scrapeResultsPage(page) {
    const listingHandles = await page.$$("div#search a[data-cid]");
    const placeIds = await page.$$eval("div#search a[data-cid]", (els) => els.map((el) => el.getAttribute("data-cid")));
    const titles = await page.$$eval('a[data-cid] div[role="heading"]', (els) => els.map((el) => el.innerText));

    for (let t = 0; t < titles.length; t++) {
        runPlaceQuery('insert into papa_places_tmp(place_id, place_title) values(?, ?)', [placeIds[t], titles[t]], '1 record inserted');
        console.log(titles[t] + " : " + placeIds[t]);
    }

    const places = [];
    for (let n = 0; n < listingHandles.length; n++) {
        await listingHandles[n].click();
        await page.waitForSelector('h2[data-attrid="title"]');
        await page.waitForTimeout(2000);
        places.push(await scrapePlaceDetails(page, placeIds[n], n + 1));
    }
    return places;
}

/**
 * Opens a Google search, follows the "more places" link and scrapes the
 * listings of up to `maxPages` result pages, storing them through the shared
 * MySQL `connection` when `sqlmode` is 1.
 *
 * The browser is always closed, including when scraping fails.
 *
 * @param {string} [searchUrl=keyword] - Search URL to open; falls back to the
 *     file-level `keyword` when omitted.
 * @param {number} [maxPages=3] - Maximum number of result pages to scrape.
 *     NOTE(review): 3 mirrors the upper bound of the page loop that was
 *     commented out in the previous version - confirm the intended depth.
 * @returns {Promise<Object[]>} The scraped place records, in visiting order.
 * @throws {Error} When the "more places" link is missing, or when any
 *     Puppeteer call fails.
 */
let scrape = async (searchUrl = keyword, maxPages = 3) => {
    const browser = await puppeteer.launch({ headless: false, slowMo: 250 });
    const places = [];
    try {
        const page = await browser.newPage();
        await page.goto(searchUrl);
        await page.setViewport({ width: 1366, height: 663 });
        await page.waitForSelector('g-more-link');
        await page.waitForTimeout(2000);

        const moreLink = await page.$('g-more-link a');
        if (moreLink === null) {
            throw new Error('link "g-more-link a" not found on ' + searchUrl);
        }
        // Start waiting before clicking so a fast navigation is not missed.
        await Promise.all([
            page.waitForNavigation({ waitUntil: 'networkidle2' }),
            moreLink.click(),
        ]);
        await page.waitForTimeout(10000);

        for (let pageNumber = 1; pageNumber <= maxPages; pageNumber++) {
            const pagePlaces = await scrapeResultsPage(page);
            places.push(...pagePlaces);
            if (pageNumber === maxPages) {
                break;
            }

            // next page
            const nextLink = await page.$('a[aria-label="Page ' + (pageNumber + 1) + '"]');
            if (nextLink === null) {
                break;
            }
            await Promise.all([
                page.waitForNavigation({ waitUntil: 'networkidle2' }),
                nextLink.click(),
            ]);
            await page.waitForTimeout(4000);
            await page.waitForSelector('div#search a[data-cid]');
            await page.waitForTimeout(5000);
        }
    } finally {
        await browser.close();
    }
    return places;
};

// Search URL for the scrape. The phrase is percent-encoded so its spaces and
// any reserved characters form a valid query string.
const keyword = 'https://www.google.com/search?q=' + encodeURIComponent('pg in sector 17');
scrape(keyword)
    .then(() => {
        console.log('');
    })
    .catch((err) => {
        // Report the failure instead of leaving an unhandled rejection.
        console.error('Scrape failed:', err);
        process.exitCode = 1;
    })
    .then(() => {
        // end() lets already-queued statements finish, then closes the
        // connection so the Node process can exit.
        connection.end();
    });