$ mongodump –host 10.0.200.100 –collection items –db crawler_data IMPORTANT: If it shows a “locale” related error, run: $ export LC_ALL=“en_US.UTF-8” OR, if that does not work: $ apt-get install locales $ locale-gen en_US.UTF-8 $ dpkg-reconfigure locales (informe o locale número 138 - en_US.UTF-8) , and try again.
$ mongorestore -d database_name /path/of/the/mongo/dump IMPORTANT: If it shows a “locale” related error, run: $ export LC_ALL=“en_US.UTF-8” , and try again. NOTE: the “-d” parameter is now required from mongo 3 on.
#!/bin/bash fields="_id.id,ean,name,department,category,url,image_url,price,special_price,instalments,instalment_value,available,brand,tyre__width,tyre__aspect_ratio,tyre__rim_diameter,tyre__load_index,tyre__speed_rating" SPIDERS=“extra gbg_pneus americanas pneu_free bridgestone casasbahia pneus_albuquerque pneustore pneu_plus pneumar kd_pneus hipervarejo pneusola colombo caçula_de_pneus tireshop dellavia walmart autoz itaro pneus_facil pontofrio magazine_luiza dpaschoal” for spider in $SPIDERS; do mongoexport –host 10.0.200.100 -d test2 -c items -q “{"_meta.spider_name": "$spider", "_spider_run_id": ObjectId("555f9343075ce800101dedb5")}” -f $fields –csv > ./$spider.csv; done
FIND / COUNT:
db.items.find({"_meta.spider_name": “pneumania”, “sku”: “163”})
db.items.find()
db.items.count()
REMOVE (DELETE):
db.items.remove({"_id.site": “Arcondicionado”})
DROP DATABASE:
IMPORTANT: use with caution, it does not ask for confirmation.
use database_to_drop
db.dropDatabase()
GET INDEXES:
db.items.getIndexes()
CREATE AN INDEX ON 2 FIELDS:
db.items.ensureIndex({“domain”: 1, “sku”: 1});
db.items.ensureIndex({“compat_domain”: 1, “_id.id”: 1});
ORDER BY FIELD IN DESCRESCENT ORDER:
db.authentication.find({}).sort({‘updated_at’: -1});
RENAME DOCUMENT FIELD REFUSED_REASONS
TO REFUSED_REASONS_OLD
:
db.authentication.update( {‘refused_rules’: {$ne: null}}, { $rename: { ‘refused_reasons’: ‘refused_reasons_old’ } }, {multi: true} )
REMOVE DOCUMENT FIELD REFUSED_REASONS_OLD
FROM ALL DOCUMENTS:
db.authentication.update( {‘refused_reasons_new’: {$ne: null}}, { $unset: { ‘refused_reasons_old’: 1 } }, {multi: true} )
SELECT DOCUMENTS CREATED ON THE LAST 24 HOURS:
db.some_collection.find({“executed_at”:{$gte: new Date(Date.now() - 246060 * 1000)}}).sort({“executed_at”: -1})
SELECT DOCUMENTS CREATED ON A RANGE OF DATES:
db.my_collection.find({“created_at”: {$gte: ISODate(“2018-07-02 02:00:00.000Z”), $lt: ISODate(“2018-07-03 02:00:00.000Z”)}}).count()
SELECT SPECIFIC (2) FIELDS FROM A DOCUMENT:
db.my_collection.find({}, {executed_at:1, increased: 1}).sort({“executed_at”: -1}).limit(9)
SELECT SPECIFIC FIELDS ON A SELECT:
db.queues.find({}, {
“spider_run_id”: 1,
“save_time”: 1
});
(Equivalent in SQL: SELECT spider_run_id, save_time FROM queues)
UPDATE MULTIPLE RECORDS ON MONGO:
IMPORTANT: by default, “find_and_modify” and “update” just operate on a single record:
db.items.update(
{"_meta.spider_name":“itaro”},
{$set:{“compat_domain”:“www.itaroUPDATED.com.br”}},
false,
true)
The third argument is “upsert”, and the fourth is “multi” - that last one updates all records matching the condition (first argument).
SHOW ALL DATABASES ON A MONGO INSTANCE:
$ show dbs;
mongo var db = connect(‘crawler_data’) db.items.find({"_id.site": “PontoFrio”}) db.items.count({"_id.site": “PontoFrio”}) db.items.runCommand({“distinct”: “items”, “key”: “_id.site”}) db.spider_runs.runCommand({“distinct”: “spider_runs”, “key”: “site”}) db.items.findOne({’_meta.spider_name’:‘poloar’})
db.items.find({"_id.site": “GBG Pneus”, “brand”: /Pneu /}).forEach(function(doc) { doc.brand = doc.brand.replace(/Pneu /g, ‘’); db.items.save(doc); }); On the example above I find all documents that have the “Pneu " string on the “brand” field and that have “_id.site” as “GBG Pneus”, and change that regex’s occurence found to “”. That will change, for example, “Pneu Maggion” to “Maggion”.
db.extractions_quality_summary.runCommand({“distinct”: “extractions_quality_summary”, “key”: “_site_name”}) Below is another way to make a distinct query with a condition (mongo 3.4 up): $ db.my_collection.distinct( “book_id”, { “created_at”: {$gte: ISODate(“2018-10-01 00:00:00.000Z”)}}) ( returns all book_ids created after 2018-10-01 ) Below I sort by “id”: db.extractions_quality_summary.find({"_site_name”: “Pneus Albuquerque”}).sort({“id”: 1}) Below I sort by “id DESC”: db.extractions_quality_summary.find({"_site_name": “Pneus Albuquerque”}).sort({“id”: -1}) SELECT * FROM spider_runs WHERE finish_time is NOT NULL db.spider_runs.find({“finish_time”: {"$ne": null}}); QUANTIDADE DE ITEMS EM UM SPIDER_RUN_ID: db.items.count({“spider_run_id” : ObjectId(“555e2b8e0bd0d30011b21a4c”)}) db.spider_runs.find({“finish_time”:{"$ne":null}}).sort({’_id’:-1}).limit(7); db.demo.find({“person_id”: {"$in": [“P001”, “P002”]}})
(adapted from: http://stackoverflow.com/questions/2298870/mongodb-get-names-of-all-keys-in-collection) On the mongo client (mongo), type the following code: use crawler_data; var my_collection_name = “items” mr = db.runCommand({ “mapreduce” : my_collection_name, “map” : function() { for (var key in this) { emit(key, null); } }, “reduce” : function(key, stuff) { return null; }, “out”: my_collection_name + “_keys” }); db[mr.result].distinct("_id")
// String to Integer db.db-name.find({field-name : {$exists : true}}).forEach( function(obj) { obj.field-name = new NumberInt( obj.field-name ); db.db-name.save(obj); } ); // Integer to String db.db-name.find({field-name : {$exists : true}}).forEach( function(obj) { obj.field-name = “"+obj.field-name; db.db-name.save(obj); } ); E.g.: db.items.find({"_id.id” : {$exists : true}}).forEach( function(obj) { obj._id.id = “"+obj._id.id; db.items.save(obj); } ); db.items.find({"_meta.spider_name”: ‘casas_bahia’, “_id.id” : {$exists : true}}).forEach( function(obj) { obj._id.id = “"+obj._id.id; db.items.save(obj); } ); db.items.find({"_meta.spider_name”: ‘autoz’, “ean” : {$exists : true}}).forEach( function(obj) { obj.ean = “"+obj.ean; db.items.save(obj); } );
Supposing I have documents with the fields increased (boolean)
and executed_at (timestamp)
. I want to count how many increased=true and increased=false I have for each day.
db.increase_limit_process.aggregate([
{
"$project":{ // here we must have the same fields used on grouping
"is_increased": "$increased",
"yearMonthDayUTC":{ // here we convert the executed_at from timestamp to a date, to be possible to group by date
"$dateToString":{
"format":"%Y-%m-%d",
"date":"$executed_at"
}
}
}
},
{
"$group":{
"_id":{ // this field is required, and must contain the fields involved on the grouping
"is_increased":"$is_increased",
"date":"$yearMonthDayUTC"
},
"count":{ // here I count one for each document, to get the total by date
"$sum":1 // although conceptually an integer, it will be presented as a float
}
}
},
{
"$sort":{
"_id.date":-1,
"_id.is_increased": 1
}
},
{
"$limit": 10
}
])
This query would have results like:
/* 1 */
{
"_id" : {
"is_increased" : false,
"date" : "2018-07-12"
},
"count" : 1121.0
}
/* 2 */
{
"_id" : {
"is_increased" : false,
"date" : "2018-07-11"
},
"count" : 1827.0
}
/* 3 */
{
"_id" : {
"is_increased" : true,
"date" : "2018-07-11"
},
"count" : 22.0
}
Still using the same document schema, below is a slighty different query.
It outputs per day, the total count of records where increased=true
:
db.increase_limit_process.aggregate([
{
"$match":{"increased": true} // since this is also called "pipeline" by mongo, the order matters. The conditions to match must always come first.
},
{
"$project":{
"yearMonthDayUTC":{
"$dateToString":{
"format":"%Y-%m-%d",
"date":"$executed_at"
}
}
}
},
{
"$group":{
"_id": "$yearMonthDayUTC",
"count":{
"$sum":1
}
}
},
{
"$sort":{
"_id":-1
}
},
{
"$limit": 10
}
]
)
, resulting in:
/* 1 */
{
"_id" : "2018-07-11",
"count" : 22.0
}
/* 2 */
{
"_id" : "2018-07-10",
"count" : 6.0
}
/* 3 */
{
"_id" : "2018-07-06",
"count" : 2.0
}
/* 4 */
{
"_id" : "2018-07-07",
"count" : 3.0
}
/* 5 */
{
"_id" : "2018-07-05",
"count" : 4.0
}
/* 6 */
{
"_id" : "2018-07-09",
"count" : 4.0
}
db.serverStatus()