// This is the Node.js script to scrape Google Maps using Puppeteer.
const puppeteer = require('puppeteer');
// Persistence switch: the scraper only issues its SQL statements while this
// value is 1. Declared explicitly so it is not an implicit global (an
// undeclared assignment throws in strict mode and in ES modules).
const sqlmode = 1;

const mysql = require('mysql');

// Connection used by every query the scraper runs.
// NOTE(review): the connection is opened regardless of `sqlmode`, and the
// credentials are hard-coded (root user, empty password) -- fine for a local
// experiment, but confirm before running this anywhere shared.
const connection = mysql.createConnection({
  host: 'localhost',
  user: 'root',
  password: '',
  database: 'google_pg',
});

// Fail fast: without a database connection there is nowhere to store results.
connection.connect((err) => {
  if (err) throw err;
  console.log('Connected to MySQL Server!');
});
/** Placeholder recorded for a detail that is missing from a place's panel. */
const NOT_AVAILABLE = 'na';

/** Selector matching the link of every place listed on a results page. */
const PLACE_LINK_SELECTOR = 'div#search a[data-cid]';

/**
 * Waits for `ms` milliseconds.
 *
 * Uses `page.waitForTimeout()` when the installed Puppeteer provides it and a
 * plain timer otherwise, because newer Puppeteer releases no longer ship that
 * method.
 *
 * @param {object} page - Puppeteer page.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
const pause = (page, ms) => {
  if (typeof page.waitForTimeout === 'function') {
    return page.waitForTimeout(ms);
  }
  return new Promise((resolve) => setTimeout(resolve, ms));
};

/**
 * Clicks `handle` and waits for the navigation that the click triggers.
 *
 * The navigation wait is registered before the click is sent so that a fast
 * navigation cannot finish unnoticed (which would leave the wait to time out).
 *
 * @param {object} page - Puppeteer page.
 * @param {object} handle - Element handle to click.
 * @returns {Promise<Array>}
 */
const clickAndWaitForNavigation = (page, handle) =>
  Promise.all([
    page.waitForNavigation({ waitUntil: 'networkidle2' }),
    handle.click(),
  ]);

/**
 * Runs one parameterised statement on the shared MySQL connection.
 * Does nothing unless `sqlmode` is 1. The statement is queued and not awaited;
 * a failure is reported on stderr instead of being logged as a success.
 *
 * @param {string} sql - Statement using `?` placeholders.
 * @param {Array} values - Values bound to the placeholders.
 * @param {string} successMessage - Message logged when the statement succeeds.
 */
const runQuery = (sql, values, successMessage) => {
  if (Number(sqlmode) !== 1) {
    return;
  }
  connection.query(sql, values, (err) => {
    if (err) {
      console.error('MySQL query failed: ' + err.message);
      return;
    }
    console.log(successMessage);
  });
};

/**
 * Updates columns of one row of `papa_places_tmp`.
 *
 * Column names come from this file only; the scraped values are always bound
 * through placeholders, so text containing quotes cannot break the statement.
 *
 * @param {string} placeId - Value of the row's `place_id`.
 * @param {Object<string, *>} columns - Column name -> new value.
 */
const updatePlace = (placeId, columns) => {
  const names = Object.keys(columns);
  const assignments = names.map((name) => name + ' = ?').join(', ');
  const values = names.map((name) => columns[name]);
  runQuery(
    'update papa_places_tmp set ' + assignments + ' where place_id = ?',
    values.concat([placeId]),
    '1 record updated'
  );
};

/**
 * Reads one value from the first element matching `selector`.
 *
 * @param {object} page - Puppeteer page.
 * @param {string} selector - CSS selector of the element to read.
 * @param {Function} extract - Runs in the page; receives the DOM node.
 * @returns {Promise<*>} The extracted value, or null when the element (or the
 *   value itself) is absent.
 */
const readDetail = async (page, selector, extract) => {
  const handle = await page.$(selector);
  if (!handle) {
    return null;
  }
  const value = await handle.evaluate(extract);
  return value == null ? null : value;
};

/**
 * Splits a comma separated address into the columns stored per place.
 *
 * Parts are read from the end of the address: the last part holds the state
 * followed by the PIN, then come district, locality 1 and locality 2. The PIN
 * is only split off when the last word is all digits, so multi-word states
 * such as "Uttar Pradesh 201301" stay intact.
 *
 * @param {string} address - Address text as shown in the place panel.
 * @returns {{address_pin: string, address_state: string,
 *   address_district: string, address_locality_1: string,
 *   address_locality_2: string}} Missing parts are empty strings.
 */
const parseAddress = (address) => {
  const parts = address
    .trim()
    .split(',')
    .map((part) => part.trim())
    .reverse();
  const stateWords = parts[0].split(/\s+/).filter(Boolean);
  let pin = '';
  if (stateWords.length > 1 && /^\d+$/.test(stateWords[stateWords.length - 1])) {
    pin = stateWords.pop();
  }
  return {
    address_pin: pin,
    address_state: stateWords.join(' '),
    address_district: parts[1] || '',
    address_locality_1: parts[2] || '',
    address_locality_2: parts[3] || '',
  };
};

/**
 * Extracts the image URL from an inline `background-image:url(...)` style.
 * Everything from the first `=` onwards is dropped (presumably a size suffix
 * -- TODO confirm); a URL without `=` is returned unchanged.
 *
 * @param {string} style - Value of the element's `style` attribute.
 * @returns {string}
 */
const extractImageUrl = (style) => {
  const url = style.replace('background-image:url(', '').replace(')', '');
  const suffixStart = url.indexOf('=');
  return suffixStart === -1 ? url : url.slice(0, suffixStart);
};

/**
 * Reads every detail of the place whose panel is currently open, stores each
 * one that is present (when `sqlmode` is 1) and logs it.
 *
 * @param {object} page - Puppeteer page showing the place panel.
 * @param {string} placeId - The place's `data-cid`, used as `place_id`.
 * @param {number} position - 1-based position in the list (used in the log).
 * @returns {Promise<object>} Scraped texts; 'na' marks a missing detail.
 */
const scrapePlaceDetails = async (page, placeId, position) => {
  const record = { placeId };
  const orNotAvailable = (value) => (value === null ? NOT_AVAILABLE : value);

  // Title
  const name = await readDetail(page, 'h2[data-attrid="title"]', (node) => node.innerText);
  if (name !== null) {
    updatePlace(placeId, { place_name: name });
  }
  record.name = orNotAvailable(name);
  console.log(position + ' name: : ' + record.name);

  // Address, split into its stored parts
  const rawAddress = await readDetail(
    page,
    'div[data-attrid="kc:/location/location:address"] span:nth-child(2)',
    (node) => node.innerText
  );
  let addressParts = parseAddress('');
  if (rawAddress === null) {
    record.address = NOT_AVAILABLE;
  } else {
    const address = rawAddress.trim();
    addressParts = parseAddress(address);
    console.log(addressParts.address_state + ' : ' + addressParts.address_pin);
    console.log(address);
    updatePlace(placeId, Object.assign({ place_address: address }, addressParts));
    record.address = address;
  }
  console.log('address_pin:' + addressParts.address_pin);
  console.log('address_state:' + addressParts.address_state);
  console.log('address_locality_1:' + addressParts.address_locality_1);
  console.log('address_locality_2:' + addressParts.address_locality_2);
  console.log('address_district:' + addressParts.address_district);

  // Phone
  const phone = await readDetail(
    page,
    'div[data-attrid="kc:/collection/knowledge_panels/has_phone:phone"] span:nth-child(2) > span > a > span',
    (node) => node.innerHTML
  );
  if (phone !== null) {
    updatePlace(placeId, { place_mobile: phone });
  }
  record.phone = orNotAvailable(phone);
  console.log('phone:' + record.phone);

  // Rating: the label reads like "Rated 4.5 out of 5,"; only "4.5" is stored.
  const ratingLabel = await readDetail(page, 'g-review-stars span', (node) =>
    node.getAttribute('aria-label')
  );
  if (ratingLabel !== null) {
    updatePlace(placeId, { place_rating: ratingLabel.split(' ')[1] });
  }
  record.rating = orNotAvailable(ratingLabel);
  console.log('rated: ' + record.rating);

  // Review count: the text reads like "2 Google reviews"; only "2" is stored.
  const reviewText = await readDetail(
    page,
    'a[data-async-trigger="reviewDialog"] span',
    (node) => node.innerText
  );
  if (reviewText !== null) {
    updatePlace(placeId, { place_totalreview: reviewText.split(' ')[0] });
  }
  record.totalReviews = orNotAvailable(reviewText);
  console.log('totalreview: ' + record.totalReviews);

  // Image
  const imageStyle = await readDetail(
    page,
    'div[data-attrid="kc:/location/location:hotel media"] > div > a > div',
    (node) => node.getAttribute('style')
  );
  const image = imageStyle === null ? null : extractImageUrl(imageStyle);
  if (image !== null) {
    updatePlace(placeId, { place_image: image });
  }
  record.image = orNotAvailable(image);
  console.log('image: ' + record.image);

  // Category
  const category = await readDetail(
    page,
    'div[data-attrid="kc:/local:lu attribute list"] > div > div > span',
    (node) => node.innerText
  );
  if (category !== null) {
    updatePlace(placeId, { place_tags: category });
  }
  record.category = orNotAvailable(category);
  console.log('category: ' + record.category);

  // Opening hours (kept as the table's inner HTML)
  const timings = await readDetail(
    page,
    'div[data-attrid="kc:/location/location:hours"] > div > div > div > div:nth-child(2) > div > table',
    (node) => node.innerHTML
  );
  if (timings !== null) {
    updatePlace(placeId, { place_timings: timings });
  }
  record.timings = orNotAvailable(timings);
  console.log('timing: ' + record.timings);

  return record;
};

/**
 * Scrapes every place listed on the results page that is currently shown:
 * inserts one row per listed title, then opens each place and reads its
 * details.
 *
 * @param {object} page - Puppeteer page showing a results list.
 * @returns {Promise<object[]>} One record per place link on the page.
 */
const scrapeListedPlaces = async (page) => {
  const placeLinks = await page.$$(PLACE_LINK_SELECTOR);
  const placeIds = await page.$$eval(PLACE_LINK_SELECTOR, (links) =>
    links.map((link) => link.getAttribute('data-cid'))
  );
  const titles = await page.$$eval('a[data-cid] div[role="heading"]', (headings) =>
    headings.map((heading) => heading.innerText)
  );

  titles.forEach((title, index) => {
    runQuery(
      'insert into papa_places_tmp(place_id, place_title) values(?, ?)',
      [placeIds[index], title],
      '1 record inserted'
    );
    console.log(title + ' : ' + placeIds[index]);
  });

  const records = [];
  for (let index = 0; index < placeLinks.length; index++) {
    await placeLinks[index].click();
    await page.waitForSelector('h2[data-attrid="title"]');
    await pause(page, 2000);
    records.push(await scrapePlaceDetails(page, placeIds[index], index + 1));
  }
  return records;
};

/**
 * Opens a Google search for places, expands the full place list and scrapes
 * the details of every listed place, page by page.
 *
 * Each place is stored in `papa_places_tmp` (when `sqlmode` is 1) and also
 * returned. The browser is closed even when scraping fails.
 *
 * @param {string} [searchUrl=keyword] - Search URL to open; defaults to the
 *   file-level `keyword`.
 * @param {number} [pageCount=2] - Maximum number of result pages to scrape.
 * @returns {Promise<object[]>} Records with `placeId`, `name`, `address`,
 *   `phone`, `rating`, `totalReviews`, `image`, `category` and `timings`;
 *   'na' marks a detail that was not present.
 * @throws {Error} When the link that opens the full place list is missing, or
 *   when Puppeteer fails (for example a navigation or selector timeout).
 */
let scrape = async (searchUrl = keyword, pageCount = 2) => {
  const browser = await puppeteer.launch({ headless: false, slowMo: 250 });
  try {
    const page = await browser.newPage();
    await page.goto(searchUrl);
    await page.setViewport({ width: 1366, height: 663 });
    await page.waitForSelector('g-more-link');
    await pause(page, 2000);

    const moreLink = await page.$('g-more-link a');
    if (moreLink === null) {
      throw new Error('Link to the full place list ("g-more-link a") was not found');
    }
    await clickAndWaitForNavigation(page, moreLink);
    await pause(page, 10000);

    const records = [];
    for (let pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
      records.push(...(await scrapeListedPlaces(page)));
      if (pageNumber === pageCount) {
        break;
      }
      // Stop early when there is no further page instead of re-scraping this one.
      const nextPageLink = await page.$('a[aria-label="Page ' + (pageNumber + 1) + '"]');
      if (nextPageLink === null) {
        break;
      }
      await clickAndWaitForNavigation(page, nextPageLink);
      await pause(page, 4000);
      await page.waitForSelector(PLACE_LINK_SELECTOR);
      await pause(page, 5000);
    }
    return records;
  } finally {
    await browser.close();
  }
};
// Search URL to scrape. The query text is percent-encoded (it contains
// spaces); the original called decodeURIComponent, which leaves it unencoded.
const keyword = 'https://www.google.com/search?q=' + encodeURIComponent('pg in sector 17');

scrape(keyword)
  .then((value) => {
    console.log('');
  })
  .catch((err) => {
    // Report the failure instead of leaving an unhandled promise rejection.
    console.error('Scraping failed:', err);
    process.exitCode = 1;
  })
  .then(() => {
    // Close the MySQL connection so the process can exit; end() lets the
    // queries that are already queued finish first.
    connection.end((err) => {
      if (err) {
        console.error('Closing the MySQL connection failed: ' + err.message);
      }
    });
  });