add index_write

This commit is contained in:
linjinming 2021-03-19 18:46:25 +08:00
parent 09c1c6a45d
commit 8f5fe9d199
26 changed files with 5388 additions and 0 deletions

View File

@ -0,0 +1,28 @@
# Build script for the index_write executable.
# target_include_directories() is used below, which needs CMake 2.8.11 or newer.
cmake_minimum_required(VERSION 2.8.11)
PROJECT(index_write)

# Short hash of the last commit that touched this directory; exported to the
# sources as GIT_VERSION. Run git inside the source tree so that out-of-source
# builds still resolve the repository and the "." pathspec.
EXECUTE_PROCESS(
    COMMAND git log -1 --pretty=format:%h .
    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
    OUTPUT_VARIABLE version
)

# Compile flags only; librt is a link input and is listed with the other libraries.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall")

# Every source file in this directory belongs to the executable.
AUX_SOURCE_DIRECTORY(. main)

LINK_DIRECTORIES(
    ${PROJECT_SOURCE_DIR}/../../comm
    ${PROJECT_SOURCE_DIR}/../../3rdlib/jsoncpp/lib
    ${PROJECT_SOURCE_DIR}/../../comm/stat
)

ADD_EXECUTABLE(index_write ${main})

target_include_directories(index_write PUBLIC
    ../../3rdlib/jsoncpp/include
    ../../comm
    ../../comm/stat
    ../index_storage/api/c_api_cc
)

add_definitions(-DGIT_VERSION="${version}" -DMAIN)

# rt goes last so that the static/shared libraries before it can resolve against it.
target_link_libraries(index_write libcommon.a libdtc.so jsoncpp stat ssl rt)

SET_TARGET_PROPERTIES(index_write PROPERTIES RUNTIME_OUTPUT_DIRECTORY "./bin")

View File

@ -0,0 +1,529 @@
/*
* =====================================================================================
*
* Filename: add_request_proc.cc
*
* Description: AddReqProc class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#include "add_request_proc.h"
#include "index_tbl_op.h"
#include "geohash.h"
#include "split_manager.h"
#include <sstream>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <iomanip>
/*
 * Default constructor: creates a processor with no document attached.
 * The numeric ids are zeroed so they never hold indeterminate values;
 * the string, vector and JSON members default-construct to empty.
 */
AddReqProc::AddReqProc()
    : app_id(0),
      doc_version(0),
      trans_version(0) {
}
/*
 * Construct a processor for one document.
 *   jf           - JSON object holding the document's fields (copied).
 *   insert_param - appid, doc id and the doc/transaction versions of this write.
 */
AddReqProc::AddReqProc(const Json::Value& jf, InsertParam& insert_param)
    : json_field(jf),
      app_id(insert_param.appid),
      doc_version(insert_param.doc_version),
      trans_version(insert_param.trans_version),
      doc_id(insert_param.doc_id) {
}
/* Nothing to release explicitly: every member cleans up after itself. */
AddReqProc::~AddReqProc() {}
void AddReqProc::do_stat_word_freq(vector<vector<string> > &strss, map<string, item> &word_map, string extend) {
string word;
uint32_t id = 0;
ostringstream oss;
vector<vector<string> >::iterator iters = strss.begin();
uint32_t index = 0;
for(;iters != strss.end(); iters++){
index++;
vector<string>::iterator iter = iters->begin();
log_debug("start do_stat_word_freq, appid = %u\n",app_id);
for (; iter != iters->end(); iter++) {
word = *iter;
if (!SplitManager::Instance()->wordValid(word, app_id, id)){
continue;
}
if (word_map.find(word) == word_map.end()) {
item it;
it.doc_id = doc_id;
it.freq = 1;
it.extend = extend;
it.indexs.push_back(index);
word_map.insert(make_pair(word, it));
}
else {
word_map[word].freq++;
word_map[word].indexs.push_back(index);
}
oss << (*iter) << "|";
}
}
log_debug("split: %s",oss.str().c_str());
}
void AddReqProc::do_stat_word_freq(vector<string> &strss, map<string, item> &word_map) {
string word;
vector<string>::iterator iters = strss.begin();
uint32_t index = 0;
for (; iters != strss.end(); iters++) {
index++;
word = *iters;
if (word_map.find(word) == word_map.end()) {
item it;
it.doc_id = doc_id;
it.freq = 1;
it.indexs.push_back(index);
word_map.insert(make_pair(word, it));
}
else {
word_map[word].freq++;
word_map[word].indexs.push_back(index);
}
}
}
/*
 * Index one field of the document according to its configured type.
 *   tbinfo     - table definition of the field (type, segment tag/feature, field id).
 *   field_name - name of the field inside json_field.
 * Returns 0 on success, RT_ERROR_FIELD_FORMAT when the JSON value does not
 * match the configured type, RT_ERROR_INSERT_INDEX_DTC (or the callee's code)
 * when an insert fails. Insert failures call roll_back() before returning;
 * format errors do not.
 * Longitude/latitude/WKT fields are only collected here (lng, lat, lng_arr,
 * lat_arr); field types without a case fall through to `default` and return 0.
 */
int AddReqProc::deal_index_tag(struct table_info *tbinfo, string field_name){
int ret =0;
map<string, item> word_map;
vector<vector<string> > split_content;
switch(tbinfo->field_type){
// String/text fields: split into words (or keep whole) and insert one index entry per word.
case FIELD_STRING:
case FIELD_TEXT:
if(json_field[field_name].isString()){
if (tbinfo->segment_tag == SEGMENT_NGRAM) { // NGram split mode
vector<string> ngram_content = SplitManager::Instance()->split(json_field[field_name].asString());
do_stat_word_freq(ngram_content, word_map);
}
else if (tbinfo->segment_tag == SEGMENT_CHINESE || tbinfo->segment_tag == SEGMENT_ENGLISH) { // use intelligent_info
string str = json_field[field_name].asString();
// segment_tag 3 requires the content to be entirely Chinese; segment_tag 4 requires it to contain no Chinese
if (tbinfo->segment_tag == SEGMENT_CHINESE && allChinese(str) == false) {
log_error("segment_tag is 3, the content[%s] must be Chinese.", str.c_str());
return RT_ERROR_FIELD_FORMAT;
}
if (tbinfo->segment_tag == SEGMENT_ENGLISH && noChinese(str) == false) {
log_error("segment_tag is 4, the content[%s] can not contain Chinese.", str.c_str());
return RT_ERROR_FIELD_FORMAT;
}
// The whole string is indexed as a single word (no splitting).
item it;
it.doc_id = doc_id;
it.freq = 1;
if(tbinfo->segment_feature == SEGMENT_FEATURE_SNAPSHOT){
Json::FastWriter ex_writer;
it.extend = ex_writer.write(snapshot_content);
}
word_map.insert(make_pair(str, it));
vector<IntelligentInfo> info;
bool flag = false;
get_intelligent(str, info, flag);
if (flag) {
// Key of the intelligent (hanpin) index is "<appid>#<field id>".
stringstream ss;
ss << app_id << "#" << tbinfo->field_value;
ret = g_hanpinIndexInstance.do_insert_intelligent(ss.str(), doc_id, str, info, doc_version);
if(0 != ret){
roll_back();
return ret;
}
// Remembered so roll_back() can delete it again.
intelligent_keys.push_back(ss.str());
}
}
else {
split_content = SplitManager::Instance()->split(json_field[field_name].asString(), app_id);
string extend = "";
if(tbinfo->segment_feature == SEGMENT_FEATURE_SNAPSHOT){
Json::FastWriter ex_writer;
extend = ex_writer.write(snapshot_content);
}
do_stat_word_freq(split_content, word_map, extend);
split_content.clear();
}
ret = g_IndexInstance.do_insert_index(word_map, app_id, doc_version, tbinfo->field_value, docid_index_map);
if (0 != ret) {
roll_back();
return ret;
}
word_map.clear();
}else{
log_error("field type error, not FIELD_STRING.");
return RT_ERROR_FIELD_FORMAT;
}
break;
// Integer fields: one index entry keyed by the value itself.
case FIELD_INT:
if(json_field[field_name].isInt()){
// NOTE(review): this `ret` shadows the function-level `ret` for the rest of this scope.
int ret;
struct item it;
it.doc_id = doc_id;
it.freq = 0;
string key = "";
if(tbinfo->segment_tag == SEGMENT_RANGE){ // range-query fields: pad the key to 20 characters with '0'
stringstream ss;
ss << setw(20) << setfill('0') << json_field[field_name].asInt();
key = gen_dtc_key_string(app_id, "00", ss.str());
} else {
// NOTE(review): the cast to uint32_t wraps negative values — confirm that is intended.
key = gen_dtc_key_string(app_id, "00", (uint32_t)json_field[field_name].asInt());
}
ret = g_IndexInstance.insert_index_dtc(key, it, tbinfo->field_value, doc_version, docid_index_map);
if(ret != 0){
roll_back();
return RT_ERROR_INSERT_INDEX_DTC;
}
}else{
log_error("field type error, not FIELD_INT.");
return RT_ERROR_FIELD_FORMAT;
}
break;
// 64-bit integer fields.
case FIELD_LONG:
if(json_field[field_name].isInt64()){
struct item it;
it.doc_id = doc_id;
it.freq = 0;
string key = gen_dtc_key_string(app_id, "00", (int64_t)json_field[field_name].asInt64());
int ret = g_IndexInstance.insert_index_dtc(key, it, tbinfo->field_value, doc_version, docid_index_map);
if(0 != ret){
roll_back();
log_error("insert_index_dtc error, appid[%d], key[%s]", app_id, key.c_str());
return RT_ERROR_INSERT_INDEX_DTC;
}
} else {
log_error("field type error, not FIELD_LONG.");
return RT_ERROR_FIELD_FORMAT;
}
break;
// Floating-point fields.
case FIELD_DOUBLE:
if(json_field[field_name].isDouble()){
struct item it;
it.doc_id = doc_id;
it.freq = 0;
string key = gen_dtc_key_string(app_id, "00", (double)json_field[field_name].asDouble());
int ret = g_IndexInstance.insert_index_dtc(key, it, tbinfo->field_value, doc_version, docid_index_map);
if(0 != ret){
roll_back();
log_error("insert_index_dtc error, appid[%d], key[%s]", app_id, key.c_str());
return RT_ERROR_INSERT_INDEX_DTC;
}
} else {
log_error("field type error, not FIELD_DOUBLE.");
return RT_ERROR_FIELD_FORMAT;
}
break;
// IPv4 fields: the dotted string is parsed and keyed by its host-byte-order integer value.
case FIELD_IP:
uint32_t s;
int ret;
if(json_field[field_name].isString()){
ret = inet_pton(AF_INET, json_field[field_name].asString().c_str(), (void *)&s);
if(ret == 0){
log_error("ip format is error\n");
return RT_ERROR_FIELD_FORMAT;
}
struct item it;
it.doc_id = doc_id;
it.freq = 0;
string key = gen_dtc_key_string(app_id, "00", ntohl(s));
ret = g_IndexInstance.insert_index_dtc(key, it, tbinfo->field_value, doc_version, docid_index_map);
if(ret != 0){
roll_back();
return RT_ERROR_INSERT_INDEX_DTC;
}
}else{
return RT_ERROR_FIELD_FORMAT;
}
break;
// Single longitude: only stored; the caller builds the GIS index once lat is known too.
case FIELD_LNG:
if(json_field[field_name].isString()){
lng = json_field[field_name].asString();
}else{
return RT_ERROR_FIELD_FORMAT;
}
break;
// Single latitude: only stored, see FIELD_LNG.
case FIELD_LAT:
if(json_field[field_name].isString()){
lat = json_field[field_name].asString();
}else{
return RT_ERROR_FIELD_FORMAT;
}
break;
// Array of longitudes (strings): appended to lng_arr.
case FIELD_LNG_ARRAY:
if(json_field[field_name].isArray()){
Json::Value lngs = json_field[field_name];
for (uint32_t lng_idx = 0; lng_idx < lngs.size(); ++lng_idx) {
if (lngs[lng_idx].isString()){
lng_arr.push_back(lngs[lng_idx].asString());
} else {
log_error("longitude must be string");
return RT_ERROR_FIELD_FORMAT;
}
}
}else{
log_error("FIELD_LNG_ARRAY must be array");
return RT_ERROR_FIELD_FORMAT;
}
break;
// Array of latitudes (strings): appended to lat_arr.
case FIELD_LAT_ARRAY:
if(json_field[field_name].isArray()){
Json::Value lats = json_field[field_name];
for (uint32_t lat_idx = 0; lat_idx < lats.size(); ++lat_idx) {
if (lats[lat_idx].isString()){
lat_arr.push_back(lats[lat_idx].asString());
} else {
log_error("latitude must be string");
return RT_ERROR_FIELD_FORMAT;
}
}
}else{
log_error("FIELD_LAT_ARRAY must be array");
return RT_ERROR_FIELD_FORMAT;
}
break;
// WKT string: after delPrefix(), each comma-separated item that splits on a
// space into exactly two parts contributes one (lng, lat) pair; other items are ignored.
case FIELD_WKT:
if(json_field[field_name].isString()){
string str = json_field[field_name].asString();
str = delPrefix(str);
vector<string> str_vec = splitEx(str, ",");
for(uint32_t str_vec_idx = 0; str_vec_idx < str_vec.size(); str_vec_idx++){
string wkt_str = trim(str_vec[str_vec_idx]);
vector<string> wkt_vec = splitEx(wkt_str, " ");
if(wkt_vec.size() == 2){
lng_arr.push_back(wkt_vec[0]);
lat_arr.push_back(wkt_vec[1]);
}
}
} else {
log_error("FIELD_WKT must be string");
return RT_ERROR_FIELD_FORMAT;
}
break;
default:
break;
}
return 0;
}
/*
 * Build and persist all indexes of the document held in json_field.
 *
 * Steps, in order:
 *   1. collect the snapshot fields (snapshot_tag == 1) into snapshot_content;
 *   2. index every field with index_tag == 1 via deal_index_tag();
 *   3. insert a geohash entry for a single lng/lat pair, if both were given;
 *   4. insert de-duplicated geohash entries for the lng/lat arrays;
 *   5. insert the union (multi-field) keys configured for the app;
 *   6. write the snapshot and the doc-id -> index-keys map.
 *
 *   content_fields - in/out: `content` is overwritten with the serialized
 *                    snapshot before it is written.
 * Returns 0 on success or an RT_* / callee error code. Every error return
 * calls roll_back() first, including the lng/lat size-mismatch path.
 */
int AddReqProc::do_insert_index(UserTableContent& content_fields){
    int ret = 0;
    Json::Value::Members member = json_field.getMemberNames();
    Json::Value::Members::iterator iter = member.begin();

    // Step 1: snapshot fields. Done for all fields first because step 2 may
    // embed the complete snapshot into index entries.
    for(; iter != member.end(); ++iter)
    {
        string field_name = *iter;
        struct table_info *tbinfo = SplitManager::Instance()->get_table_info(app_id, field_name);
        if(tbinfo == NULL || tbinfo->snapshot_tag != 1){
            continue;
        }
        const Json::Value &value = json_field[field_name];
        if(tbinfo->field_type == 1){
            if(value.isInt()){
                snapshot_content[field_name] = value.asInt();
            }
        }else if(tbinfo->field_type > 1){
            // Order matters: the first matching JSON type wins.
            if(value.isString()){
                snapshot_content[field_name] = value.asString();
            }else if(value.isDouble()){
                snapshot_content[field_name] = value.asDouble();
            }else if(value.isInt64()){
                snapshot_content[field_name] = value.asInt64();
            }else if(value.isArray()){
                snapshot_content[field_name] = value;
            }
        }
    }

    // Step 2: per-field indexes.
    for(iter = member.begin(); iter != member.end(); ++iter)
    {
        string field_name = *iter;
        struct table_info *tbinfo = SplitManager::Instance()->get_table_info(app_id, field_name);
        if(tbinfo == NULL){
            continue;
        }
        if(tbinfo->index_tag == 1){
            ret = deal_index_tag(tbinfo, field_name);
            if(0 != ret){
                log_error("deal index tag process error, ret: %d", ret);
                roll_back();
                return ret;
            }
        }
    }

    // Step 3: single point -> one geohash entry under the "gis" field.
    if(lng.length() != 0 && lat.length() != 0){
        struct table_info *tbinfo = SplitManager::Instance()->get_table_info(app_id, "gis");
        if(tbinfo == NULL){
            roll_back();
            return RT_NO_GIS_DEFINE;
        }
        string gisid = encode(atof(lat.c_str()), atof(lng.c_str()), 6);
        log_debug("gis code = %s",gisid.c_str());
        uint64_t id = 0;
        struct item it;
        it.doc_id = doc_id;
        it.freq = 0;
        Json::FastWriter gis_writer;
        it.extend = gis_writer.write(snapshot_content);
        string key = gen_dtc_key_string(app_id, "00", gisid);
        int insert_ret = g_IndexInstance.insert_index_dtc(key, it, tbinfo->field_value, doc_version, docid_index_map);
        if(insert_ret != 0){
            roll_back();
            return RT_ERROR_INSERT_INDEX_DTC;
        }
        log_debug("id = %llu,doc_vesion = %d,docid = %s\n",(long long unsigned int)id,doc_version,it.doc_id.c_str());
    }

    // Step 4: point arrays -> one geohash entry per distinct geohash.
    log_debug("lng_arr size: %d, lat_arr size: %d", (int)lng_arr.size(), (int)lat_arr.size());
    if(lng_arr.size() > 0 && lat_arr.size() > 0){
        if(lng_arr.size() != lat_arr.size()){
            log_error("lng_arr size not equal with lat_arr size");
            // Indexes inserted by the earlier steps must not be left behind.
            roll_back();
            return RT_ERROR_FIELD_FORMAT;
        }
        // Both lookups are loop-invariant, so they are done once.
        struct table_info *tbinfo = SplitManager::Instance()->get_table_info(app_id, "gis");
        if(tbinfo == NULL){
            roll_back();
            log_error("gis field not defined");
            return RT_NO_GIS_DEFINE;
        }
        Json::FastWriter gis_writer;
        const string snapshot_json = gis_writer.write(snapshot_content);

        set<string> gis_set;
        for(uint32_t arr_idx = 0; arr_idx < lng_arr.size(); arr_idx++){
            string gisid = encode(atof(lat_arr[arr_idx].c_str()), atof(lng_arr[arr_idx].c_str()), 6);
            if(!gis_set.insert(gisid).second){
                continue;   // this geohash cell was already indexed
            }
            struct item it;
            it.doc_id = doc_id;
            it.freq = 0;
            it.extend = snapshot_json;
            string key = gen_dtc_key_string(app_id, "00", gisid);
            int insert_ret = g_IndexInstance.insert_index_dtc(key, it, tbinfo->field_value, doc_version, docid_index_map);
            if(insert_ret != 0){
                roll_back();
                return RT_ERROR_INSERT_INDEX_DTC;
            }
            log_debug("gis code = %s,doc_vesion = %d,docid = %s\n",gisid.c_str(),doc_version,it.doc_id.c_str());
        }
    }

    // Step 5: union keys. Each configured union key is a comma-separated list
    // of field ids; the keys already recorded for those fields are combined.
    vector<string> union_key_vec;
    SplitManager::Instance()->getUnionKeyField(app_id, union_key_vec);
    vector<string>::iterator union_key_iter = union_key_vec.begin();
    for(; union_key_iter != union_key_vec.end(); union_key_iter++){
        string union_key = *union_key_iter;
        vector<int> union_field_vec = splitInt(union_key, ",");
        vector<int>::iterator union_field_iter = union_field_vec.begin();
        vector<vector<string> > keys_vvec;
        for(; union_field_iter != union_field_vec.end(); union_field_iter++){
            int union_field_value = *union_field_iter;
            // Negative ids must be rejected too: Json::Value::operator[](int)
            // does not accept a negative index.
            if(union_field_value < 0 || union_field_value >= (int)docid_index_map.size()){
                log_error("appid[%d] field[%d] is invalid", app_id, *union_field_iter);
                break;
            }
            vector<string> key_vec;
            if(!docid_index_map[union_field_value].isArray()){
                log_debug("doc_id[%s] union_field_value[%d] has no keys", doc_id.c_str(), union_field_value);
                break;
            }
            for (int key_index = 0; key_index < (int)docid_index_map[union_field_value].size(); key_index++){
                if(docid_index_map[union_field_value][key_index].isString()){
                    string union_index_key = docid_index_map[union_field_value][key_index].asString();
                    // Inverted-index keys look like "10061#00#discount"; the first
                    // 9 characters are dropped to keep the part after the second '#'.
                    if(union_index_key.size() > 9){
                        key_vec.push_back(union_index_key.substr(9));
                    }
                }
            }
            keys_vvec.push_back(key_vec);
        }
        if(keys_vvec.size() != union_field_vec.size()){
            // NOTE(review): this stops processing ALL remaining union keys, not
            // just the current one — confirm `break` (rather than `continue`) is intended.
            log_debug("keys_vvec.size not equal union_field_vec.size");
            break;
        }
        vector<string> union_keys = combination(keys_vvec);
        for(int m = 0 ; m < (int)union_keys.size(); m++){
            // Failures are logged only; they do not abort or roll back the write.
            ret = g_IndexInstance.insert_union_index_dtc(union_keys[m], doc_id, app_id, doc_version);
            if(ret != 0){
                log_error("insert union key[%s] error", union_keys[m].c_str());
            }
        }
    }

    // Step 6: snapshot and doc-id -> index-keys map.
    Json::FastWriter writer;
    content_fields.content = writer.write(snapshot_content);
    Json::FastWriter doc_index_writer;
    string doc_index_map_string = doc_index_writer.write(docid_index_map);

    const bool is_update = (doc_version != 1);
    if(is_update){
        // Queue the index entries of the previous version for deletion.
        map<uint32_t, vector<string> > index_res;
        g_IndexInstance.GetIndexData(gen_dtc_key_string(content_fields.appid, "20", doc_id), doc_version - 1, index_res);
        map<uint32_t, vector<string> >::iterator map_iter = index_res.begin();
        for(; map_iter != index_res.end(); map_iter++){
            uint32_t field = map_iter->first;
            vector<string> &words = map_iter->second;
            for(int i = 0; i < (int)words.size(); i++){
                DeleteTask::GetInstance().RegisterInfo(words[i], doc_id, doc_version - 1, field);
            }
        }
    }

    int affected_rows = 0;
    ret = g_IndexInstance.update_sanpshot_dtc(content_fields, doc_version, trans_version, affected_rows);
    if(ret != 0 || affected_rows == 0){
        log_error("update_sanpshot_dtc error, roll back, ret: %d, affected_rows: %d.", ret, affected_rows);
        roll_back();
        return RT_ERROR_UPDATE_SNAPSHOT;
    }

    if(is_update){
        g_IndexInstance.update_docid_index_dtc(doc_index_map_string, doc_id, app_id, doc_version);
    }else{
        g_IndexInstance.insert_docid_index_dtc(doc_index_map_string, doc_id, app_id, doc_version);
    }
    return 0;
}
int AddReqProc::roll_back(){
// 删除hanpin_index
for(int i = 0; i < (int)intelligent_keys.size(); i++){
g_hanpinIndexInstance.delete_intelligent(intelligent_keys[i], doc_id, trans_version);
}
// 删除keyword_index
if(docid_index_map.isArray()){
for(int i = 0;i < (int)docid_index_map.size();i++){
Json::Value info = docid_index_map[i];
if(info.isArray()){
for(int j = 0;j < (int)info.size();j++){
if(info[j].isString()){
string key = info[j].asString();
g_IndexInstance.delete_index(key, doc_id, trans_version, i);
}
}
}
}
}
// 如果trans_version=1删除快照否则更新快照的trans_version=trans_version-1
Json::Value res;
if(trans_version == 1){
g_IndexInstance.delete_snapshot_dtc(doc_id, app_id, res);
} else {
g_IndexInstance.update_sanpshot_dtc(app_id, doc_id, trans_version);
}
return 0;
}

View File

@ -0,0 +1,58 @@
/*
* =====================================================================================
*
* Filename: add_request_proc.h
*
* Description: AddReqProc class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#ifndef ADD_REQUEST_PROC_H
#define ADD_REQUEST_PROC_H
#include "log.h"
#include "json/json.h"
#include "comm.h"
class UserTableContent;
class SplitManager;
// Handles one "add document" request: turns the document's JSON fields into
// index entries and a snapshot, and rolls its own writes back on failure.
class AddReqProc
{
public:
AddReqProc();
// jf: the document's fields; insert_param: appid, doc id and versions of this write.
AddReqProc(const Json::Value& jf, InsertParam& insert_param);
~AddReqProc();
// Builds and persists all indexes plus the snapshot. Returns 0 on success,
// otherwise an error code; content_fields.content is overwritten with the snapshot JSON.
int do_insert_index(UserTableContent& content_fields);
private:
// Word-frequency statistics over grouped words; `extend` is stored in newly created items.
void do_stat_word_freq(vector<vector<string> > &strss, map<string, item> &word_map, string extend);
// Word-frequency statistics over a flat word list.
void do_stat_word_freq(vector<string> &strss, map<string, item> &word_map);
// Indexes (or, for geo fields, collects) a single field according to its table definition.
int deal_index_tag(struct table_info *tbinfo, string field_name);
// Deletes the index entries written so far and reverts/deletes the snapshot. Always returns 0.
int roll_back();
private:
Json::Value json_field; // the document's fields, as received
uint32_t app_id;
uint32_t doc_version;
uint32_t trans_version;
string doc_id;
string lng; // single longitude collected from a FIELD_LNG field
string lat; // single latitude collected from a FIELD_LAT field
vector<string> lng_arr; // longitudes collected from FIELD_LNG_ARRAY / FIELD_WKT fields
vector<string> lat_arr; // latitudes collected from FIELD_LAT_ARRAY / FIELD_WKT fields
vector<string> intelligent_keys; // hanpin index keys inserted so far (used by roll_back)
Json::Value snapshot_content; // fields flagged with snapshot_tag, keyed by field name
Json::Value docid_index_map; // index keys written so far, read as an array indexed by field id
};
#endif

View File

@ -0,0 +1,121 @@
/*
* =====================================================================================
*
* Filename: comm.h
*
* Description: common enumeration classes definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#ifndef __COMM_H__
#define __COMM_H__
#include <string>
#include <stdint.h>
#include <vector>
using namespace std;
// Packs `a` into the upper 32 bits of a 64-bit key and ORs `b` into it.
// Fully parenthesized and without a trailing ';' so it can be used inside
// larger expressions.
#define BUILD_BIGINT_KEY(a,b) (((((unsigned long long)(a)) << 32)&0xffffffff00000000ll) | (b))
// Extracts the upper 32 bits of a 64-bit key. Fully parenthesized so that
// comparisons such as KETTOAPPID(k) == id bind as expected.
#define KETTOAPPID(a) ((((unsigned long long)(a))>>32)&0xFFFFFFFF)
#define MESSAGE "message"
// One index entry (posting) for a word or key of a document.
struct item {
string doc_id; // document the entry belongs to
uint32_t freq; // occurrence count of the word; set to 0 by the numeric/IP/GIS callers
vector<uint32_t> indexs; // 1-based occurrence indexes recorded by do_stat_word_freq
string extend; // extra payload; callers store the serialized snapshot JSON here when needed
};
// Identity and version information of one document write; copied into
// AddReqProc by its constructor.
struct InsertParam{
uint32_t appid;
string doc_id;
uint32_t doc_version;
uint32_t trans_version;
};
// Kind of a piece of content (stored in Content::type).
enum CHARACTERTYPE {
CHINESE = 1, // assigned by get_intelligent() to multi-byte UTF-8 characters
INITIAL = 2, // assigned by get_intelligent() to single-byte characters
WHOLE_SPELL = 3,
};
// Configured type of a table field. The numeric values are spelled out
// because other code compares field_type against plain integers.
enum FieldType{
    FIELD_INT       = 1,
    FIELD_STRING    = 2,
    FIELD_TEXT      = 3,
    FIELD_IP        = 4,
    FIELD_LNG       = 5,
    FIELD_LAT       = 6,
    FIELD_GIS       = 7,
    FIELD_DISTANCE  = 8,
    FIELD_DOUBLE    = 9,
    FIELD_LONG      = 10,
    FIELD_INDEX     = 11,
    FIELD_LNG_ARRAY = 12,
    FIELD_LAT_ARRAY = 13,
    FIELD_WKT       = 14,
};
// How a field's content is split/keyed when it is indexed (see deal_index_tag).
enum SEGMENTTAG {
SEGMENT_DEFAULT = 1,
SEGMENT_NGRAM = 2, // string fields: NGram split mode
SEGMENT_CHINESE = 3, // string fields: content must be entirely Chinese; indexed whole
SEGMENT_ENGLISH = 4, // string fields: content must not contain Chinese; indexed whole
SEGMENT_RANGE = 5, // int fields: key is zero-padded to width 20 for range queries
};
// Extra indexing behaviour of a field.
enum SegmentFeature
{
SEGMENT_FEATURE_DEFAULT = 0, // default: only prefix fuzzy matching is supported
SEGMENT_FEATURE_ALLLOCATE = 1, // fuzzy matching at any position is supported
SEGMENT_FEATURE_SNAPSHOT = 2, // the `extend` field of this field's inverted index carries the snapshot
};
// Command codes; the numeric values are fixed explicitly.
enum CmdType {
CMD_INDEX_GEN = 106,
CMD_TOP_INDEX = 107,
CMD_SNAPSHOT = 108,
CMD_IMAGE_REPORT = 109,
};
// Return codes. Values are written out explicitly so a code seen in a log
// can be looked up directly and so that inserting a new entry cannot shift
// the existing ones.
enum RetCode{
    // command identifiers
    RT_CMD_ADD                    = 10000,
    RT_CMD_UPDATE                 = 10001,
    RT_CMD_GET                    = 10002,
    RT_CMD_DELETE                 = 10003,

    // error codes
    RT_PARSE_JSON_ERR             = 20001,
    RT_PARSE_CONF_ERR             = 20002,
    RT_INIT_ERR                   = 20003,
    RT_NO_TABLE_CONTENT           = 20004,
    RT_NO_FIELD_COUNT             = 20005,
    RT_NO_APPID                   = 20006,
    RT_NO_DOCID                   = 20007,
    RT_NO_GIS_DEFINE              = 20008,
    RT_ERROR_FIELD_COUNT          = 20009,
    RT_ERROR_FIELD_CMD            = 20010,
    RT_ERROR_FIELD                = 20011,
    RT_ERROR_SERVICE_TYPE         = 20012,
    RT_ERROR_GET_SNAPSHOT         = 20013,
    RT_ERROR_DELETE_SNAPSHOT      = 20014,
    RT_ERROR_UPDATE_SNAPSHOT      = 20015,
    RT_ERROR_INSERT_SNAPSHOT      = 20016,
    RT_ERROR_INSERT_TOP_INDEX_DTC = 20017,
    RT_ERROR_INSERT_INDEX_DTC     = 20018,
    RT_ERROR_INVALID_SP_WORD      = 20019,
    RT_ERROR_FIELD_FORMAT         = 20020,
    RT_ERROR_GET_GISCODE          = 20021,
    RT_NO_THIS_DOC                = 20022,
    RT_UPDATE_SNAPSHOT_CONFLICT   = 20023,
    RT_ERROR_INDEX_READONLY       = 20024
};
#endif

View File

@ -0,0 +1,52 @@
{
"program_name" : "index_write v1.0",
"pid_file" : "index_write.pid",
"log" : "../log/",
"log_level" : 7,
"daemon": true,
"listen_addr": "*:11017/tcp",
"timeout": 6000,
"stop_words_path":"../conf/stop_words.dict",
"training_path":"../conf/msr_training.utf8",
"words_base_path":"../conf/words_base.dict",
"words_file":"../conf/words_base.txt",
"character_path":"../conf/character_map.txt",
"phonetic_path":"../conf/phonetic_map.txt",
"phonetic_base_file" : "../conf/phonetic_base.txt",
"service_type":"index_gen",
"dtc_index_config" :
{
"table_name": "keyword_index_data",
"accesskey": "000020942f22577f38c66a20d8cb8ba30cbb3d75",
"timeout": 4000,
"keytype": 4,
"route":
[
{
"ip": "127.0.0.1",
"bid": 2094,
"port": 20000,
"weight": 1,
"status": 1
}
]
},
"dtc_intelligent_config" :
{
"table_name": "hanpin_index_data",
"accesskey": "000020915b27ecebfbb0dfa6e4cf32397c2bf7be",
"timeout": 4000,
"keytype": 4,
"route":
[
{
"ip": "127.0.0.1",
"bid": 2091,
"port": 20001,
"weight": 1,
"status": 1
}
]
},
"split_mode":"Post"
}

View File

@ -0,0 +1,434 @@
/*
* =====================================================================================
*
* Filename: dtc_tools.cc
*
* Description: DTCTools class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#include "dtc_tools.h"
#include "split_manager.h"
#include "log.h"
#include "comm.h"
#include <iostream>
#include <sstream>
#include <string.h>
// Table of 23 pinyin initials (leading consonants, including the two-letter zh/ch/sh).
string initial_table[] = { "b","p","m","f","d","t","n","l","g","k","h","j","q","x","zh","ch","sh","r","z","c","s","y","w" };
/*
 * Configure a DTC server group from the parsed host configuration:
 * table name, route list, timeout, access key and key type, in that order.
 * Returns 0 on success, or the first non-zero code reported by the DTC API
 * (a short message is printed to stdout in that case).
 */
int DTCTools::init_servers(DTC::DTCServers &servers, SDTCHost &dtc_config)
{
    int ret = servers.SetTableName(dtc_config.szTablename.c_str());
    if (ret != 0)
    {
        cout << "SetTableName error !\n";
        return ret;
    }

    // Translate every configured route into the API's route node.
    std::vector<DTC::ROUTE_NODE> list;
    std::vector<SDTCroute>::const_iterator route_elem = dtc_config.vecRoute.begin();
    while (route_elem != dtc_config.vecRoute.end())
    {
        DTC::ROUTE_NODE route;
        route.bid = route_elem->uBid;
        route.port = route_elem->uPort;
        route.status = route_elem->uStatus;
        route.weight = route_elem->uWeight;

        // NOTE(review): the copy is not bounded by the capacity of route.ip —
        // confirm configured addresses always fit.
        const char *ip_text = route_elem->szIpadrr.c_str();
        const size_t ip_len = strlen(ip_text);
        memcpy(route.ip, ip_text, ip_len);
        route.ip[ip_len] = '\0';

        list.push_back(route);
        route_elem++;
    }

    ret = servers.SetRouteList(list);
    if (ret != 0)
    {
        cout << "SetRouteList error!\n";
        return ret;
    }

    servers.SetMTimeout(dtc_config.uTimeout);

    ret = servers.SetAccessKey(dtc_config.szAccesskey.c_str());
    if (ret != 0)
    {
        cout << "SetAccessKey error !\n";
        return ret;
    }

    ret = servers.SetKeyType(dtc_config.uKeytype);
    if (ret != 0)
    {
        cout << "SetKeyType error !\n";
        return ret;
    }
    return ret;
}
/*
 * Configure a single DTC server handle: table name, address, timeout and key
 * kind (integer key for key types 1 and 2, string key otherwise).
 * Returns 0 on success or the non-zero code from SetTableName(); the other
 * setters' results are not checked.
 */
int DTCTools::init_dtc_server(DTC::Server &server, const char *ip_str, const char *dtc_port, SDTCHost &dtc_config)
{
    const int ret = server.SetTableName(dtc_config.szTablename.c_str());
    if (ret != 0)
    {
        cout << "SetTableName error !\n";
        return ret;
    }

    server.SetAddress(ip_str, dtc_port);
    server.SetMTimeout(dtc_config.uTimeout);

    const bool uses_int_key = (dtc_config.uKeytype == 1) || (dtc_config.uKeytype == 2);
    if (uses_int_key)
    {
        server.IntKey();
    }
    else
    {
        server.StringKey();
    }
    return ret;
}
/*
 * Create a server handle for ip_str:port_str and register it under ip_port_key.
 * Returns false, without registering anything, when the handle could not be
 * initialised (init_dtc_server() has already printed the reason); true otherwise.
 * An existing entry for the same key is left untouched (map::insert semantics).
 */
bool DTCTools::insert_dtc_server(u_int64_t ip_port_key,const char *ip_str,const char *port_str,SDTCHost &dtc_config){
    DTC::Server s;
    if (init_dtc_server(s, ip_str, port_str, dtc_config) != 0) {
        // Do not hand out a half-configured handle through find_dtc_server().
        return false;
    }
    dtc_handle.insert(make_pair(ip_port_key, s));
    return true;
}
/* Builds the DTC key "<appid>#<type>#<key>" for a string key. */
string gen_dtc_key_string(uint32_t appid, string type, string key) {
    ostringstream composed;
    composed << appid;
    composed << "#" << type;
    composed << "#" << key;
    return composed.str();
}
/* Builds the DTC key "<appid>#<type>#<key>" for an unsigned 32-bit key (decimal). */
string gen_dtc_key_string(uint32_t appid, string type, uint32_t key) {
    ostringstream composed;
    composed << appid;
    composed << "#" << type;
    composed << "#" << key;
    return composed.str();
}
/* Builds the DTC key "<appid>#<type>#<key>" for a signed 64-bit key (decimal). */
string gen_dtc_key_string(uint32_t appid, string type, int64_t key) {
    ostringstream composed;
    composed << appid;
    composed << "#" << type;
    composed << "#" << key;
    return composed.str();
}
/*
 * Builds the DTC key "<appid>#<type>#<key>" for a floating-point key.
 * The value is printed with the stream's default formatting (6 significant
 * digits), so large or very precise values are rounded in the key.
 */
string gen_dtc_key_string(uint32_t appid, string type, double key) {
    ostringstream composed;
    composed << appid;
    composed << "#" << type;
    composed << "#" << key;
    return composed.str();
}
/* True when c is one of the letters treated as a pinyin vowel ("aeiouv"). */
static bool split_func_is_vowel(char c) {
    return strchr("aeiouv", c) != NULL;
}

/* True when text[pos] is 'z', 'c' or 's' and is directly followed by 'h'. */
static bool split_func_has_h_suffix(const string &text, int pos) {
    if ((pos + 1) >= (int)text.size()) {
        return false;
    }
    const char lead = text.at(pos);
    return (lead == 'z' || lead == 'c' || lead == 's') && text.at(pos + 1) == 'h';
}

/*
 * Insert spaces into a run of pinyin letters at the guessed syllable borders.
 *   pinyin    - input letters, e.g. "zhongguo".
 *   split_str - output, e.g. "zhong guo".
 * Rules, scanning left to right:
 *   - vowels are copied as-is;
 *   - a consonant other than 'n' starts a new syllable (space before it,
 *     except at position 0); "zh"/"ch"/"sh" are kept together;
 *   - 'n' depends on what follows: before a vowel it starts a syllable,
 *     before a consonant other than 'g' it ends one, and "ng" ends a syllable
 *     unless a vowel follows, in which case the split goes between n and g.
 */
void split_func(string pinyin, string &split_str) {
    const int length = (int)pinyin.size();
    stringstream out;
    int pos = 0;

    while (pos < length) {
        const char cur = pinyin.at(pos);

        if (split_func_is_vowel(cur)) {
            out << cur;
            ++pos;
            continue;
        }

        if (cur != 'n') {
            // Any other consonant opens a syllable.
            if (pos != 0) {
                out << ' ';
            }
            out << cur;
            if (split_func_has_h_suffix(pinyin, pos)) {
                out << 'h';
                ++pos;
            }
            ++pos;
            continue;
        }

        // cur == 'n'
        if (pos == length - 1) {
            out << cur;
            ++pos;
            continue;
        }

        ++pos;
        const char after_n = pinyin.at(pos);

        if (split_func_is_vowel(after_n)) {
            // "n" + vowel: n opens the syllable.
            if (pos != 1) {
                out << ' ';
            }
            out << 'n' << after_n;
            ++pos;
            continue;
        }

        if (after_n != 'g') {
            // "n" + consonant (not g): n closes the syllable.
            out << 'n' << ' ' << after_n;
            if (split_func_has_h_suffix(pinyin, pos)) {
                out << 'h';
                ++pos;
            }
            ++pos;
            continue;
        }

        // "ng"
        if (pos == length - 1) {
            out << 'n' << after_n;
            ++pos;
            continue;
        }

        ++pos;
        const char after_ng = pinyin.at(pos);
        if (split_func_is_vowel(after_ng)) {
            // "ng" + vowel: g belongs to the next syllable.
            out << 'n' << ' ' << 'g' << after_ng;
        } else {
            out << 'n' << 'g' << ' ' << after_ng;
            if (split_func_has_h_suffix(pinyin, pos)) {
                out << 'h';
                ++pos;
            }
        }
        ++pos;
    }

    split_str = out.str();
}
void convert_intelligent_alpha_num(const vector<Content> &result, vector<IntelligentInfo> &info_vec, bool &flag) {
int i = 0;
flag = true;
IntelligentInfo basic_info;
vector<Content>::const_iterator content_iter = result.begin();
for (; content_iter != result.end(); content_iter++, i++) {
if (i >= 16) {
log_info("content length[%d] must be less than 16", (int)result.size());
break;
}
basic_info.initial_char[i] = ((*content_iter).str)[0];
}
info_vec.push_back(basic_info);
}
/*
 * Build the intelligent infos for content that contains Chinese characters.
 *   result   - one Content per character (CHINESE or other); only the first 8 are used.
 *   info_vec - receives one IntelligentInfo per combination of the readings of
 *              the Chinese characters (polyphonic characters multiply the count).
 *   flag     - always set to true.
 */
void convert_intelligent(const vector<Content> &result, vector<IntelligentInfo> &info_vec, bool &flag) {
int i = 0;
flag = true;
IntelligentInfo basic_info;
// One entry per Chinese character: its candidate readings, and how many there are.
vector<vector<string> > phonetic_id_vecs;
vector<uint32_t> length_vec;
vector<Content>::const_iterator content_iter = result.begin();
for (; content_iter != result.end(); content_iter++, i++) {
if (i >= 8) {
log_info("content length[%d] must be less than 8", (int)result.size());
break;
}
uint32_t charact_id = 0;
uint32_t phonetic_id = 0;
vector<string> phonetic_id_vec;
if ((*content_iter).type == CHINESE) { // look up the character id
SplitManager::Instance()->GetCharactId((*content_iter).str, charact_id);
basic_info.charact_id[i] = charact_id;
vector<string> vec = SplitManager::Instance()->GetPhonetic((*content_iter).str);
if (vec.size() == 1) {
phonetic_id_vec.push_back(vec[0]);
}
else if (vec.size() > 1) { // polyphonic character
int j = 0;
for (; j < (int)vec.size(); j++) {
// NOTE(review): the id fetched here is not used in this loop; ids are looked up again below.
SplitManager::Instance()->GetPhoneticId(vec[j], phonetic_id);
phonetic_id_vec.push_back(vec[j]);
}
}
phonetic_id_vecs.push_back(phonetic_id_vec);
length_vec.push_back(phonetic_id_vec.size());
}
else {
basic_info.initial_char[i] = (*content_iter).str[0];
}
}
/*
* Enumerate every combination of readings.
*
* Example: two characters with two readings each give 4 combinations:
* chongchuan chongzhuan zhongchuan zhongzhuan
*
* In general, with columns holding e.g. 4, 2 and 3 readings:
* int factor[3][4] =
* {
* {0, 1, 2, 3},
* {0, 1},
* {0, 1, 2},
* };
* there are 4x2x3 = 24 combinations. Each combination number in 0 - 23 is
* decoded column by column: the reading index of a column is (k % len) and
* k is then divided by len before moving to the next column.
*/
i = 0;
int j = 0;
int k = 0;
int len = 0;
int len_num = 0;
int totalLength = 1;
uint32_t phonetic_id = 0;
int colum = phonetic_id_vecs.size();
// Total number of combinations = product of the per-column reading counts.
for (i = 0; i < colum; i++)
{
totalLength *= length_vec[i];
}
for (i = 0; i < totalLength; i++) {
k = i;
len_num = 0;
IntelligentInfo info = basic_info;
for (j = 0; j < colum; j++) {
len = length_vec[len_num];
string phonetic = phonetic_id_vecs[j][k % len];
SplitManager::Instance()->GetPhoneticId(phonetic, phonetic_id);
// NOTE(review): j counts Chinese characters only, while charact_id above is
// indexed by position in `result`; with mixed content the slots may not line up — confirm.
info.phonetic_id[j] = phonetic_id;
if (phonetic.size() > 1) {
info.initial_char[j] = phonetic[0];
}
k = k / len;
len_num++;
}
info_vec.push_back(info);
}
// No Chinese character at all: still emit the basic info once.
if (info_vec.size() == 0 && phonetic_id_vecs.size() == 0) {
info_vec.push_back(basic_info);
}
}
void get_intelligent(string str, vector<IntelligentInfo> &info_vec, bool &flag) {
vector<Content> result;
set<string> initial_vec(initial_table, initial_table + 23);
iutf8string utf8_str(str);
int i = 0;
if (noChinese(str)) {
for (; i < (int)str.length(); i++) {
Content content;
content.str = str[i];
content.type = INITIAL;
result.push_back(content);
}
convert_intelligent_alpha_num(result, info_vec, flag);
}
else{
for (; i < utf8_str.length(); ) {
if (utf8_str[i].size() > 1) {
Content content;
content.type = CHINESE;
content.str = utf8_str[i];
result.push_back(content);
i++;
}
else {
Content content;
content.type = INITIAL;
content.str = utf8_str[i];
result.push_back(content);
i++;
}
}
convert_intelligent(result, info_vec, flag);
}
}
/*
 * True when the length reported by iutf8string equals the byte length of
 * `str` — presumably meaning every character is a single byte (no Chinese).
 */
bool noChinese(string str) {
    iutf8string utf8_str(str);
    return utf8_str.length() == (int)str.length();
}
/*
 * True when no element of the iutf8string view of `str` has length 1,
 * i.e. the text contains no single-byte character. Empty input yields true.
 */
bool allChinese(string str) {
    iutf8string utf8_str(str);
    int pos = 0;
    while (pos < utf8_str.length()) {
        if (utf8_str[pos].length() == 1) {
            return false;
        }
        ++pos;
    }
    return true;
}
/*
** Builds every combination obtained by picking one element from each inner
** vector, joined with '_', varying the last dimension fastest.
**   input : [[a],[b1,b2],[c1,c2,c3]]
**   output: [a_b1_c1,a_b1_c2,a_b1_c3,a_b2_c1,a_b2_c2,a_b2_c3]
** An empty outer vector yields an empty result; a single dimension is
** returned unchanged; any empty inner vector makes the whole result empty.
*/
vector<string> combination(vector<vector<string> > &dimensionalArr){
    if (dimensionalArr.empty()) {
        return vector<string>();
    }

    // Fold the dimensions in from left to right instead of recursing, which
    // avoids re-copying the remaining dimensions at every level.
    vector<string> joined = dimensionalArr[0];
    for (size_t dim = 1; dim < dimensionalArr.size(); ++dim) {
        const vector<string> &choices = dimensionalArr[dim];
        vector<string> extended;
        extended.reserve(joined.size() * choices.size());
        for (size_t i = 0; i < joined.size(); ++i) {
            for (size_t j = 0; j < choices.size(); ++j) {
                extended.push_back(joined[i] + "_" + choices[j]);
            }
        }
        joined.swap(extended);
    }
    return joined;
}
/*
 * Split `src` on `separate_character` and convert each token with atoi().
 *   src                - text such as "1,2,3".
 *   separate_character - separator; may be longer than one character (e.g. ",,").
 * Tokens that are empty or consist of a single space are skipped, wherever
 * they occur. An empty separator means "no splitting": the whole input is
 * treated as one token (previously this looped forever).
 */
vector<int> splitInt(const string& src, string separate_character)
{
    vector<int> values;

    if (separate_character.empty()) {
        if (!src.empty() && src != " ") {
            values.push_back(atoi(src.c_str()));
        }
        return values;
    }

    const string::size_type separator_len = separate_character.size();
    string::size_type token_begin = 0;
    for (;;) {
        const string::size_type separator_pos = src.find(separate_character, token_begin);
        const bool is_last = (separator_pos == string::npos);
        const string token = is_last ? src.substr(token_begin)
                                     : src.substr(token_begin, separator_pos - token_begin);
        if (!token.empty() && token != " ") {
            values.push_back(atoi(token.c_str()));
        }
        if (is_last) {
            break;
        }
        token_begin = separator_pos + separator_len;
    }
    return values;
}

View File

@ -0,0 +1,90 @@
/*
* =====================================================================================
*
* Filename: dtc_tools.h
*
* Description: DTCTools class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#ifndef SRC_INDEX_GEN_DTC_TOOLS_H_
#define SRC_INDEX_GEN_DTC_TOOLS_H_
#include "index_conf.h"
#include "dtcapi.h"
#include <map>
// Helper around the DTC client API: configures server handles from the parsed
// configuration and keeps a registry of per-address handles. Accessed through
// CSingleton via Instance()/Destroy().
class DTCTools{
public:
    DTCTools(){}

    // Singleton access, delegated to CSingleton.
    static DTCTools *Instance() { return CSingleton<DTCTools>::Instance(); }
    static void Destroy() { CSingleton<DTCTools>::Destroy(); }

    // Configure a server group (routes, timeout, access key, key type). 0 on success.
    int init_servers(DTC::DTCServers &servers, SDTCHost &dtc_config);
    // Configure a single server handle for ip_str:dtc_port. 0 on success.
    int init_dtc_server(DTC::Server &server, const char *ip_str, const char *dtc_port,SDTCHost &dtc_config);

    // Returns the handle registered under ip_port_key, or NULL when there is none.
    DTC::Server *find_dtc_server(u_int64_t ip_port_key){
        map<u_int64_t, DTC::Server>::iterator found = dtc_handle.find(ip_port_key);
        if(found == dtc_handle.end()){
            return NULL;
        }
        return &found->second;
    }

    // Creates a handle for ip_str:port_str and registers it under ip_port_key.
    bool insert_dtc_server(u_int64_t ip_port_key, const char *ip_str, const char *port_str, SDTCHost &dtc_config);

private:
    map<u_int64_t, DTC::Server> dtc_handle;   // registered handles, keyed by ip_port_key
};
/*
 * Three fixed-size arrays: 8 character ids, 8 phonetic ids and 16 strings.
 * A freshly constructed object has every id set to 0 and every string empty.
 */
struct IntelligentInfo {
    // Value-initialising the id arrays zeroes them; the strings in
    // initial_char are default-constructed, which already leaves them empty.
    IntelligentInfo() : charact_id(), phonetic_id() {
    }
    uint16_t charact_id[8];
    uint16_t phonetic_id[8];
    string initial_char[16];
};
// A string together with a numeric type tag.
// NOTE(review): the meaning of the tag values is not visible in this header - confirm at the producers.
struct Content {
uint32_t type; // type tag for `str`
string str; // the text itself
};
// Build a DTC key string from an appid, a type string and a key value;
// one overload per key value type.
// NOTE(review): the exact key layout is defined in the implementation file - confirm there.
string gen_dtc_key_string(uint32_t appid, string type, string key);
string gen_dtc_key_string(uint32_t appid, string type, uint32_t key);
string gen_dtc_key_string(uint32_t appid, string type, int64_t key);
string gen_dtc_key_string(uint32_t appid, string type, double key);
// NOTE(review): the four helpers below are implemented elsewhere; their contracts are not visible here.
void split_func(string pinyin, string &split_str);
void get_intelligent(string str, vector<IntelligentInfo> &info_vec, bool &flag);
void convert_intelligent(const vector<Content> &result, vector<IntelligentInfo> &info_vec, bool &flag);
void convert_intelligent_alpha_num(const vector<Content> &result, vector<IntelligentInfo> &info_vec, bool &flag);
// Combines the string lists in dimensionalArr; entries taken from different lists are joined with "_".
vector<string> combination(vector<vector<string> > &dimensionalArr);
// Splits src on separate_character and converts each piece to an int with atoi.
vector<int> splitInt(const string& src, string separate_character);
// NOTE(review): presumably character-class predicates on `str` - verify against the implementation.
bool noChinese(string str);
bool allChinese(string str);
#endif /* SRC_INDEX_GEN_DTC_TOOLS_H_ */

View File

@ -0,0 +1,474 @@
/*
* =====================================================================================
*
* Filename: image_service.cc
*
* Description: CTaskImage class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#include "image_service.h"
#include <iostream>
#include <string>
#include <map>
#include "log.h"
#include "poll_thread.h"
#include "task_request.h"
#include "dtc_tools.h"
#include "comm.h"
#include "index_clipping.h"
#include "monitor.h"
#include "chash.h"
// Builds the task on poll thread `o`: the pointer is handed to the
// CTaskDispatcher base, stored in ownerThread and used to construct `output`.
CTaskImage::CTaskImage(CPollThread * o) :
CTaskDispatcher<CTaskRequest>(o),
ownerThread(o),
output(o)
{
}
// Destructor with an empty body: no explicit cleanup is performed here.
CTaskImage::~CTaskImage()
{
}
int CTaskImage::insert_snapshot_dtc(const UserTableContent &fields,int &doc_version,Json::Value &res){
int ret;
DTC::Server* dtc_server = index_servers.GetServer();
if(NULL == dtc_server){
log_error("snapshot server connect error!");
return RT_ERROR_GET_SNAPSHOT;
}
DTC::InsertRequest insertReq(dtc_server);
insertReq.SetKey(gen_dtc_key_string(fields.appid, "10", fields.doc_id).c_str());
insertReq.Set("doc_id", fields.doc_id.c_str());
insertReq.Set("doc_version", doc_version);
insertReq.Set("extend", fields.content.c_str());
insertReq.Set("field", fields.top);
insertReq.Set("weight", fields.weight);
insertReq.Set("created_time", fields.publish_time);
insertReq.Set("word_freq", 0);
insertReq.Set("location", "");
insertReq.Set("start_time", 0);
insertReq.Set("end_time", 0);
DTC::Result rst;
ret = insertReq.Execute(rst);
if (ret != 0)
{
log_error("insert request error! ,errno %d ,errmsg %s, errfrom %s\n", ret, rst.ErrorMessage(), rst.ErrorFrom());
return -1;
}
return 0;
}
int CTaskImage::delete_snapshot_dtc(string &doc_id,uint32_t appid,Json::Value &res){
int ret;
DTC::Server* dtc_server = index_servers.GetServer();
if(NULL == dtc_server){
log_error("snapshot server connect error!");
return RT_ERROR_GET_SNAPSHOT;
}
DTC::DeleteRequest deleteReq(dtc_server);
ret = deleteReq.SetKey(gen_dtc_key_string(appid, "10", doc_id).c_str());
DTC::Result rst;
ret = deleteReq.Execute(rst);
if (ret != 0)
{
log_error("delete request error! ,errno %d ,errmsg %s, errfrom %s\n", ret, rst.ErrorMessage(), rst.ErrorFrom());
return RT_ERROR_DELETE_SNAPSHOT;
}
return 0;
}
/*
 * Runs a get request for the snapshot row of `fields` (key type "10"),
 * asking only for the doc_version column. The return value is the result
 * of Execute; the results of SetKey and Need are not inspected.
 */
static int get_snapshot_execute(DTC::Server* dtc_server,const UserTableContent &fields,DTC::Result &rst){
    DTC::GetRequest query(dtc_server);
    const string row_key = gen_dtc_key_string(fields.appid, "10", fields.doc_id);
    query.SetKey(row_key.c_str());
    query.Need("doc_version");
    return query.Execute(rst);
}
int CTaskImage::get_snapshot_active_doc(const UserTableContent &fields,int &doc_version,Json::Value &res){
int ret;
DTC::Server* dtc_server = index_servers.GetServer();
if(NULL == dtc_server){
log_error("snapshot server connect error!");
return RT_ERROR_GET_SNAPSHOT;
}
DTC::Result rst;
ret = get_snapshot_execute(dtc_server,fields,rst);
if (ret != 0) {
if (ret == -110) {
rst.Reset();
ret = get_snapshot_execute(dtc_server,fields,rst);
if (ret != 0) {
log_error("get request error! errcode %d,errmsg %s, errfrom %s", ret, rst.ErrorMessage(), rst.ErrorFrom());
return RT_ERROR_GET_SNAPSHOT;
}
}
else {
log_error("get request error! errcode %d,errmsg %s, errfrom %s", ret, rst.ErrorMessage(), rst.ErrorFrom());
return RT_ERROR_GET_SNAPSHOT;
}
}
int cnt = rst.NumRows();
struct index_item item;
if (rst.NumRows() <= 0) {
return RT_NO_THIS_DOC;
}
else {
for (int i = 0; i < cnt; i++) {
rst.FetchRow();
doc_version = rst.IntValue("doc_version");
}
}
return 0;
}
/*
 * Inserts one inverted-index row for `key`.
 *
 * The positions in it.indexs are serialised into the "location" column as
 * "[p0,p1,...]". At most the first 26 positions are written. An empty
 * position list is written as "[]"; stripping a trailing comma with substr
 * would otherwise remove the opening bracket and leave a bare "]".
 *
 * Returns the result of the insert request's Execute call; error details
 * are left in `rst` for the caller.
 */
static int insert_index_execute(DTC::Server* dtcServer,string key,struct item &it,u_int8_t field_type,int doc_version,DTC::Result &rst){
    stringstream index_sstr;
    index_sstr << "[";
    int count = 0;
    for (vector<uint32_t>::iterator iter = it.indexs.begin(); iter != it.indexs.end(); ++iter) {
        if (count++ > 25) {
            break;
        }
        // the separator goes in front of every entry except the first
        if (iter != it.indexs.begin()) {
            index_sstr << ",";
        }
        index_sstr << *iter;
    }
    index_sstr << "]";
    const string index_str = index_sstr.str();

    DTC::InsertRequest insertReq(dtcServer);
    insertReq.SetKey(key.c_str());
    insertReq.Set("doc_id", it.doc_id.c_str());
    insertReq.Set("field", field_type);
    insertReq.Set("word_freq", it.freq);
    insertReq.Set("weight", 1);
    insertReq.Set("extend","");
    insertReq.Set("doc_version",doc_version);
    insertReq.Set("created_time",time(NULL));
    insertReq.Set("location", index_str.c_str());
    return insertReq.Execute(rst);
}
/*
 * Sets the access key on dtcServer and inserts one index row through
 * insert_index_execute.
 *
 * The access key is the doc_id formatted with "%40s" into a 41-byte buffer,
 * i.e. right-aligned and space-padded to 40 characters; snprintf limits the
 * output to the buffer size.
 *
 * Returns 0 on success. On failure the error text is stored in
 * res[MESSAGE] and -1 is returned.
 */
int CTaskImage::insert_index_dtc(DTC::Server* dtcServer,string key,struct item &it,u_int8_t field_type,int doc_version,Json::Value &res){
    char access_key[41] = { '0' };
    snprintf(access_key, sizeof(access_key), "%40s", it.doc_id.c_str());
    dtcServer->SetAccessKey(access_key);

    DTC::Result outcome;
    const int rc = insert_index_execute(dtcServer,key,it,field_type,doc_version,outcome);
    if (rc == 0) {
        return 0;
    }
    log_error("insert request error! ,errno %d ,errmsg %s, errfrom %s\n", rc,outcome.ErrorMessage(), outcome.ErrorFrom());
    res[MESSAGE] = outcome.ErrorMessage();
    return -1;
}
/*
 * Inserts one index row per entry of word_map and then per entry of
 * title_map. Both maps use key type "00" and field type 3.
 *
 * Stops at the first failing insert and returns RT_ERROR_INSERT_INDEX_DTC;
 * returns 0 when every row was inserted.
 */
int CTaskImage::do_insert_index(DTC::Server* dtcServer, map<string, item> &word_map, map<string, item> &title_map,uint64_t app_id,int doc_version,Json::Value &res) {
    map<string, item> *sources[2] = { &word_map, &title_map };
    for (int which = 0; which < 2; ++which) {
        map<string, item> &current = *sources[which];
        for (map<string, item>::iterator entry = current.begin(); entry != current.end(); ++entry) {
            string key = gen_dtc_key_string(app_id, "00", entry->first);
            item posting = entry->second;
            if (insert_index_dtc(dtcServer,key,posting,3,doc_version,res) != 0) {
                return RT_ERROR_INSERT_INDEX_DTC;
            }
        }
    }
    return 0;
}
int CTaskImage::pre_process(void){
DTCTools *dtc_tools = DTCTools::Instance();
dtc_tools->init_servers(index_servers,IndexConf::Instance()->GetDTCIndexConfig());
return 0;
}
void CTaskImage::do_stat_word_freq(vector<vector<string> > &strss, string &doc_id, uint32_t appid, map<string, item> &word_map,Json::Value &res) {
string word;
uint32_t id = 0;
ostringstream oss;
vector<vector<string> >::iterator iters = strss.begin();
uint32_t index = 0;
for(;iters != strss.end(); iters++){
index++;
vector<string>::iterator iter = iters->begin();
log_debug("start do_stat_word_freq, appid = %u\n",appid);
for (; iter != iters->end(); iter++) {
word = *iter;
if (!SplitManager::Instance()->wordValid(word, appid, id)){
continue;
}
log_debug("id == %u\n",id);
if (word_map.find(word) == word_map.end()) {
item it;
it.doc_id = doc_id;
it.freq = 1;
it.indexs.push_back(index);
word_map.insert(make_pair(word, it));
}
else {
word_map[word].freq++;
word_map[word].indexs.push_back(index);
}
oss << (*iter) << "|";
}
}
log_debug("split: %s",oss.str().c_str());
}
/*
 * Extracts the three top-level request members, in this order:
 *   "table_content" (array) -> subreq, else RT_NO_TABLE_CONTENT
 *   "appid" (int)           -> id,     else RT_NO_APPID
 *   "fields_count" (int)    -> count,  else RT_NO_FIELD_COUNT
 * Outputs assigned before a failing check keep their new value.
 * Returns 0 when all three are present.
 */
static int decode_request(const Json::Value &req, Json::Value &subreq, uint32_t &id, uint32_t &count){
    if(!(req.isMember("table_content") && req["table_content"].isArray())){
        return RT_NO_TABLE_CONTENT;
    }
    subreq = req["table_content"];

    if(!(req.isMember("appid") && req["appid"].isInt())){
        return RT_NO_APPID;
    }
    id = req["appid"].asInt();

    if(!(req.isMember("fields_count") && req["fields_count"].isInt())){
        return RT_NO_FIELD_COUNT;
    }
    count = req["fields_count"].asInt();
    return 0;
}
/*
 * Updates the snapshot row of the document described by `fields`.
 *
 * The row is addressed with key type "10", the same type that
 * insert_snapshot_dtc, delete_snapshot_dtc and the snapshot lookup use for
 * this row. Key type "00" is what do_insert_index uses for index rows, so an
 * update built with it would not address the row written by the insert.
 *
 * doc_version, weight and created_time are always written; extend is only
 * written when fields.content is non-empty.
 *
 * Returns 0 on success, RT_ERROR_GET_SNAPSHOT when no server handle is
 * available and RT_ERROR_UPDATE_SNAPSHOT when the update request fails.
 * `res` is not modified.
 */
int CTaskImage::update_sanpshot_dtc(const UserTableContent &fields,int doc_version,Json::Value &res){
    int ret = 0;
    DTC::Server* dtc_server = index_servers.GetServer();
    if(NULL == dtc_server){
        log_error("snapshot server connect error!");
        return RT_ERROR_GET_SNAPSHOT;
    }
    DTC::UpdateRequest updateReq(dtc_server);
    ret = updateReq.SetKey(gen_dtc_key_string(fields.appid, "10", fields.doc_id).c_str());
    updateReq.Set("doc_version", doc_version);
    if (fields.content.length() > 0)
        updateReq.Set("extend", fields.content.c_str());
    updateReq.Set("weight", fields.weight);
    updateReq.Set("created_time", fields.publish_time);
    DTC::Result rst;
    ret = updateReq.Execute(rst);
    if (ret != 0)
    {
        log_error("updateReq error! ,errno %d ,errmsg %s, errfrom %s\n", ret, rst.ErrorMessage(), rst.ErrorFrom());
        return RT_ERROR_UPDATE_SNAPSHOT;
    }
    return ret;
}
/*
 * Decodes one element of "table_content" into `fields`.
 *
 * Return values:
 *   RT_CMD_ADD          cmd is "add" or "update" and "fields" is an object
 *   RT_CMD_DELETE       cmd is "delete" and fields.doc_id is a string
 *   RT_ERROR_FIELD_CMD  cmd is a string but none of the three commands
 *   0                   everything else (no cmd, no usable "fields", ...)
 *
 * For add/update the optional string members doc_id, title, content and
 * author are copied when present, "url" is stored in fields.description,
 * weight defaults to 1 and publish_time defaults to the current time.
 */
static int decode_fields(Json::Value table_content,UserTableContent &fields){
    const time_t now = time(NULL);

    if(!(table_content.isMember("cmd") && table_content["cmd"].isString())){
        return 0;
    }
    const string cmd = table_content["cmd"].asString();

    if(cmd == "delete"){
        Json::Value field = table_content["fields"];
        if(field.isMember("doc_id") && field["doc_id"].isString()){
            fields.doc_id = field["doc_id"].asString();
            return RT_CMD_DELETE;
        }
        return 0;
    }

    if(cmd != "add" && cmd != "update"){
        return RT_ERROR_FIELD_CMD;
    }

    if(!(table_content.isMember("fields") && table_content["fields"].isObject())){
        return 0;
    }
    Json::Value field = table_content["fields"];

    if(field.isMember("doc_id") && field["doc_id"].isString()){
        fields.doc_id = field["doc_id"].asString();
    }
    if(field.isMember("title") && field["title"].isString()){
        fields.title = field["title"].asString();
    }
    if(field.isMember("content") && field["content"].isString()){
        fields.content = field["content"].asString();
    }
    if(field.isMember("author") && field["author"].isString()){
        fields.author = field["author"].asString();
    }
    if(field.isMember("url") && field["url"].isString()){
        fields.description = field["url"].asString();
    }

    if(field.isMember("weight") && field["weight"].isInt()){
        fields.weight = field["weight"].asInt();
    }else{
        fields.weight = 1;
    }

    if(field.isMember("publish_time") && field["publish_time"].isInt()){
        fields.publish_time = field["publish_time"].asInt();
    }else{
        fields.publish_time = now;
    }
    return RT_CMD_ADD;
}
/*
 * Processes one decoded request: every element of "table_content" is
 * either an add/update or a delete.
 *
 * add/update: look up the stored doc_version (an existing document gets
 * version + 1, a missing one keeps 0), split content and title into words,
 * insert the index rows, then update the snapshot row when the version is
 * non-zero or insert it otherwise. The results of the snapshot update and
 * insert are not inspected.
 * delete: remove the snapshot row.
 * Elements that decode to anything else are skipped.
 *
 * Returns early on a request-level decode error, a snapshot lookup failure
 * or an index insert failure; otherwise returns the status left by the last
 * element processed.
 */
int CTaskImage::index_gen_process(Json::Value &req,Json::Value &res){
    uint32_t app_id,fields_count = 0;
    Json::Value table_content;

    int ret = decode_request(req, table_content, app_id,fields_count);
    if(ret != 0){
        return ret;
    }
    if(fields_count == 0 || fields_count != table_content.size()){
        return RT_ERROR_FIELD_COUNT;
    }

    DTC::Server* dtcServer = index_servers.GetServer();
    for(int i = 0;i < (int)table_content.size();i++){
        UserTableContent fields(app_id);
        ret = decode_fields(table_content[i],fields);

        if(RT_CMD_DELETE == ret){
            ret = delete_snapshot_dtc(fields.doc_id,fields.appid,res);//not use the doc_version curr
            continue;
        }
        if(RT_CMD_ADD != ret){
            continue;
        }

        int doc_version = 0;
        int old_version = 0;
        ret = get_snapshot_active_doc(fields,old_version,res);
        if(0 == ret){
            doc_version = ++old_version;
        }else if(ret != RT_NO_THIS_DOC){
            return ret;
        }

        vector<vector<string> > split_content = SplitManager::Instance()->split(fields.content,fields.appid);
        vector<vector<string> > split_title = SplitManager::Instance()->split(fields.title,fields.appid);

        map<string, item> word_map;
        map<string, item> title_map;
        do_stat_word_freq(split_content, fields.doc_id,fields.appid, word_map,res);
        do_stat_word_freq(split_title, fields.doc_id, fields.appid, title_map,res);

        ret = do_insert_index(dtcServer, word_map, title_map,app_id,doc_version,res);
        if(0 != ret){
            return ret;
        }

        if(doc_version != 0){//need update
            update_sanpshot_dtc(fields,doc_version,res);
        }else{
            insert_snapshot_dtc(fields,doc_version,res);//insert the snapshot doc
        }
    }
    return ret;
}
/*
 * Entry point for one incoming task.
 *
 * The request must carry the SERVICE_PIC command and a JSON object body;
 * the body is handed to index_gen_process. The reply is the JSON document
 * `res`, whose "code" member is 0 unless a check or the processing step
 * reports an error. Profiling information is registered at the start and
 * closed before returning.
 */
void CTaskImage::TaskNotify(CTaskRequest * curr)
{
    log_debug("CTaskImage::TaskNotify start");
    common::CallerInfo caller_info = common::ProfilerMonitor::GetInstance().RegisterInfo(std::string("searchEngine.searchService.imageReportTask"));
    //there is a race condition here:
    //curr may be deleted during process (in task->ReplyNotify())
    Json::Reader reader;
    Json::FastWriter writer;
    Json::Value value, res;
    std::string req;
    res["code"] = 0;
    CTaskRequest * task = curr;
    if(NULL == curr){
        common::ProfilerMonitor::GetInstance().FunctionError(caller_info);
        common::ProfilerMonitor::GetInstance().RegisterInfoEnd(caller_info);
        return;
    }

    if(SERVICE_PIC != task->GetReqCmd()){
        res["code"] = RT_ERROR_SERVICE_TYPE;
        res["reqcmd"] = task->GetReqCmd();
        res["message"] = "service type wrong! need 109";
    }else{
        req = task->buildRequsetString();
        log_debug("recv:%s\n",req.c_str());
        if(!reader.parse(req,value,false)){
            log_error("parse json error!\ndata:%s errors:%s\n",req.c_str(),reader.getFormattedErrorMessages().c_str());
            res["code"] = RT_PARSE_JSON_ERR;
            res["message"] = reader.getFormattedErrorMessages();
            res["data"] = req;
        }else if(!value.isObject()){
            log_error("parse json error!\ndata:%s errors:%s\n",req.c_str(),reader.getFormattedErrorMessages().c_str());
            res["code"] = RT_PARSE_JSON_ERR;
            res["message"] = "it's not a json";
            res["data"] = req;
        }else{
            const int ret = index_gen_process(value,res);
            if(0 != ret){
                res["code"] = ret;
            }
        }
    }

    task->setResult(writer.write(res));
    task->ReplyNotify();
    common::ProfilerMonitor::GetInstance().RegisterInfoEnd(caller_info);
    return;
}

View File

@ -0,0 +1,68 @@
/*
* =====================================================================================
*
* Filename: image_service.h
*
* Description: CTaskImage class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#ifndef IMAGE_SERVICE_H_
#define IMAGE_SERVICE_H_
#include <set>
#include <vector>
#include <sstream>
#include "request_base.h"
#include "index_conf.h"
#include "dtcapi.h"
#include "comm.h"
#include "split_manager.h"
using namespace std;
class CPollThread;
class CTaskRequest;
class SplitManager;
class DTCServers;
// Task dispatcher for CTaskRequest objects: TaskNotify() parses a JSON
// request and index_gen_process() applies it to the DTC servers held in
// index_servers.
class CTaskImage : public CTaskDispatcher<CTaskRequest>
{
private:
CPollThread * ownerThread; // poll thread passed to the constructor
CRequestOutput<CTaskRequest> output; // bound to the next dispatcher through BindDispatcher()
DTC::DTCServers index_servers; // server set initialised by pre_process()
private:
// Inserts one index row for `key`; returns 0 on success, -1 on failure.
int insert_index_dtc(DTC::Server* dtcServer,string key,struct item &it,u_int8_t field_type,int doc_version,Json::Value &res);
// Inserts the rows of word_map, then title_map; stops at the first failure.
int do_insert_index(DTC::Server* dtcServer, map<string, item> &word_map, map<string, item> &title_map,uint64_t app_id,int doc_version,Json::Value &res);
// Accumulates word frequency and positions of the valid words in strss into word_map.
void do_stat_word_freq(vector<vector<string> > &strss, string &doc_id, uint32_t appid, map<string, item> &word_map,Json::Value &res);
// Reads the stored doc_version of a document into the int& argument (named `active` here).
int get_snapshot_active_doc(const UserTableContent &fields,int &active,Json::Value &res);
// Snapshot row maintenance: delete, insert and update.
int delete_snapshot_dtc(string &doc_id,uint32_t appid,Json::Value &res);
int insert_snapshot_dtc(const UserTableContent &fields,int &doc_version,Json::Value &res);
int update_sanpshot_dtc(const UserTableContent &fields,int doc_version,Json::Value &res);
public:
CTaskImage(CPollThread * o);
virtual ~CTaskImage();
// Applies every command of one decoded request; returns 0 or an RT_* code.
int index_gen_process(Json::Value &req,Json::Value &res);
// Initialises index_servers from the index configuration; always returns 0.
int pre_process(void);
// Forwards `p` to the output member.
inline void BindDispatcher(CTaskDispatcher<CTaskRequest> *p)
{
output.BindDispatcher(p);
}
// Handles one incoming task and sends the JSON reply.
virtual void TaskNotify(CTaskRequest * curr);
};
#endif /* IMAGE_SERVICE_H_ */

View File

@ -0,0 +1,274 @@
/*
* =====================================================================================
*
* Filename: index_clipping.cc
*
* Description: IndexClipping class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#include <iostream>
#include <string>
#include "log.h"
#include "index_clipping.h"
// Starts with an empty index set and remembers `server` as the snapshot server.
IndexClipping::IndexClipping(DTC::Server* server)
    : indexSet(), snapshot_server(server) {
}
// Destructor with an empty body: snapshot_server is not released here.
IndexClipping::~IndexClipping() {
}
/*
 * Runs a get request keyed by doc_id, restricted to rows whose doc_version,
 * appid and top columns equal the arguments and whose active column is 1.
 * The return value is the result of Execute; the results of SetKey and EQ
 * are not inspected.
 */
static int get_snapshot_execute(DTC::Server* dtc_server,string &doc_id,uint32_t appid,int doc_version,DTC::Result &rst,int top){
    DTC::GetRequest query(dtc_server);
    query.SetKey(doc_id.c_str());
    query.EQ("doc_version",doc_version);
    query.EQ("appid",appid);
    query.EQ("active",1);
    query.EQ("top",top);
    return query.Execute(rst);
}
bool IndexClipping::is_active_doc(string &doc_id,uint32_t appid,int doc_version,int top){
int ret;
DTC::Result rst;
ret = get_snapshot_execute(snapshot_server,doc_id,appid,doc_version,rst,top);
if (ret != 0) {
if (ret == -110) {
rst.Reset();
ret = get_snapshot_execute(snapshot_server,doc_id,appid,doc_version,rst,top);
if (ret != 0) {
log_error("get request error! errcode %d,errmsg %s, errfrom %s", ret, rst.ErrorMessage(), rst.ErrorFrom());
return true;//not clipping
}
}
else {
log_error("get request error! errcode %d,errmsg %s, errfrom %s", ret, rst.ErrorMessage(), rst.ErrorFrom());
return true;//not clipping
}
}
struct index_item item;
if (rst.NumRows() <= 0) {
return false;
}
else {
return true;
}
return true;
}
/*
 * Deletes the rows under `key` whose doc_id, created_time, field and
 * doc_version columns equal those of `item`.
 * Returns true when the delete request succeeds, false otherwise.
 */
bool IndexClipping::do_delete_index_dtc(DTC::Server* dtc_server, string key, const struct index_item& item){
    DTC::DeleteRequest removal(dtc_server);
    removal.SetKey(key.c_str());
    removal.EQ("doc_id", item.doc_id.c_str());
    removal.EQ("created_time", item.created_time);
    removal.EQ("field", item.field);
    removal.EQ("doc_version",item.doc_version);

    DTC::Result outcome;
    const int rc = removal.Execute(outcome);
    if (rc != 0) {
        log_error("delete request error! ,errno %d ,errmsg %s, errfrom %s\n", rc,outcome.ErrorMessage(), outcome.ErrorFrom());
        return false;
    }
    log_debug("delete key = %s doc_id = %s doc_verson = %d field = %d createdtime = %d",key.c_str(),item.doc_id.c_str(),item.doc_version,item.field,item.created_time);
    return true;
}
/*
 * Deletes the rows under `key` whose doc_id, created_time and doc_version
 * columns equal those of `item`; the field column is not part of the filter.
 * Returns true when the delete request succeeds, false otherwise.
 */
bool IndexClipping::do_delete_top_index_dtc(DTC::Server* dtc_server,string key, const struct index_item&item){
    DTC::DeleteRequest removal(dtc_server);
    removal.SetKey(key.c_str());
    removal.EQ("doc_id", item.doc_id.c_str());
    removal.EQ("created_time", item.created_time);
    removal.EQ("doc_version",item.doc_version);

    DTC::Result outcome;
    const int rc = removal.Execute(outcome);
    if (rc != 0) {
        log_error("delete request error! ,errno %d ,errmsg %s, errfrom %s\n", rc,outcome.ErrorMessage(), outcome.ErrorFrom());
        return false;
    }
    log_debug("delete key = %s doc_id = %s doc_verson = %d createdtime = %d",key.c_str(),item.doc_id.c_str(),item.doc_version,item.created_time);
    return true;
}
/*
 * Trims the collected rows down to 80% of rows_limit.
 *
 * When indexSet holds more entries than that, the surplus is deleted from
 * the DTC server starting with the smallest entries in set order. A failed
 * delete is logged and the loop continues. indexSet is cleared before
 * returning; the result is always true.
 */
bool IndexClipping::do_index_clipping(DTC::Server* dtc_server,string key,uint32_t rows_limit){
    if(indexSet.size() > ((rows_limit * 80) / 100)){
        uint64_t surplus = indexSet.size() - ((rows_limit * 80) / 100);
        set<struct index_item>::iterator victim = indexSet.begin();
        while(victim != indexSet.end() && surplus > 0){
            if(!do_delete_index_dtc(dtc_server,key,*victim)){
                log_error("do delete dtc error!!");
            }
            ++victim;
            --surplus;
        }
    }
    indexSet.clear();
    return true;
}
/*
 * Same trimming as do_index_clipping, but the surplus rows are removed
 * with do_delete_top_index_dtc. indexSet is cleared before returning; the
 * result is always true.
 */
bool IndexClipping::do_top_index_clipping(DTC::Server* dtc_server,string key,uint32_t rows_limit){
    if(indexSet.size() > ((rows_limit * 80) / 100)){
        uint64_t surplus = indexSet.size() - ((rows_limit * 80) / 100);
        set<struct index_item>::iterator victim = indexSet.begin();
        while(victim != indexSet.end() && surplus > 0){
            if(!do_delete_top_index_dtc(dtc_server,key,*victim)){
                log_error("do delete dtc error!!");
            }
            ++victim;
            --surplus;
        }
    }
    indexSet.clear();
    return true;
}
/*
 * Runs a get request for `key`, asking for the created_time, doc_id,
 * field, word_freq and doc_version columns. The return value is the result
 * of Execute; the results of SetKey and Need are not inspected.
 */
static int get_index_dtc_execute(DTC::Server* dtc_server,string key, DTC::Result &rst){
    DTC::GetRequest query(dtc_server);
    query.SetKey(key.c_str());
    query.Need("created_time");
    query.Need("doc_id");
    query.Need("field");
    query.Need("word_freq");
    query.Need("doc_version");
    return query.Execute(rst);
}
/*
 * Loads every row stored under `key` and clips the list to the row limit.
 *
 * A first attempt that returns -110 is retried once. Each row becomes an
 * index_item in indexSet; a row that compares equal to one already in the
 * set is deleted from the server right away. do_index_clipping then removes
 * the surplus.
 *
 * Returns false when the query fails or yields no rows, otherwise the
 * result of do_index_clipping.
 */
bool IndexClipping::get_rows_and_index_clipping(DTC::Server* dtc_server,string key,uint32_t rows_limit){
    log_debug("get_rows_and_index_clipping start!");

    DTC::Result rows;
    int rc = get_index_dtc_execute(dtc_server,key,rows);
    if (rc == -110) {
        rows.Reset();
        rc = get_index_dtc_execute(dtc_server,key,rows);
    }
    if (rc != 0) {
        log_error("get request error! errcode %d,errmsg %s, errfrom %s", rc, rows.ErrorMessage(), rows.ErrorFrom());
        return false;
    }

    const int row_total = rows.NumRows();
    if (row_total <= 0) {
        log_debug("no data in this node");
        return false;
    }

    for (int fetched = 0; fetched < row_total; ++fetched) {
        rows.FetchRow();
        struct index_item item;
        item.created_time = rows.IntValue("created_time");
        item.doc_id = rows.StringValue("doc_id");
        item.field = rows.IntValue("field");
        item.freq = rows.IntValue("word_freq");
        item.doc_version = rows.IntValue("doc_version");
        const bool inserted = indexSet.insert(item).second;
        if(!inserted){
            if(!do_delete_index_dtc(dtc_server,key,item)){
                log_error("do delete dtc error!");
            }
        }
    }
    return do_index_clipping(dtc_server,key,rows_limit);
}
/*
 * Runs a get request for `key`, asking for the created_time, doc_id,
 * doc_version and end_time columns. The return value is the result of
 * Execute; the results of SetKey and Need are not inspected.
 */
static int get_top_index_dtc_execute(DTC::Server* dtc_server,string key, DTC::Result &rst){
    DTC::GetRequest query(dtc_server);
    query.SetKey(key.c_str());
    query.Need("created_time");
    query.Need("doc_id");
    query.Need("doc_version");
    query.Need("end_time");
    return query.Execute(rst);
}
/*
 * Loads every row stored under `key` and clips the list to the row limit.
 *
 * A first attempt that returns -110 is retried once. Rows whose end_time
 * lies before the time sampled at entry are deleted immediately. The other
 * rows are collected in indexSet with freq and field set to 0; a row that
 * compares equal to one already in the set is deleted as well.
 * do_top_index_clipping then removes the surplus.
 *
 * Returns false when the query fails, otherwise the result of
 * do_top_index_clipping (also when no rows came back).
 */
bool IndexClipping::get_rows_and_top_index_clipping(DTC::Server* dtc_server,string key,uint32_t rows_limit){
    log_debug("get_rows_and_top_index_clipping start!");
    const time_t now_time = time(NULL);

    DTC::Result rows;
    int rc = get_top_index_dtc_execute(dtc_server,key,rows);
    if (rc == -110) {
        rows.Reset();
        rc = get_top_index_dtc_execute(dtc_server,key,rows);
    }
    if (rc != 0) {
        log_error("get request error! errcode %d,errmsg %s, errfrom %s", rc, rows.ErrorMessage(), rows.ErrorFrom());
        return false;
    }

    const int row_total = rows.NumRows();
    for (int fetched = 0; fetched < row_total; ++fetched) {
        rows.FetchRow();
        struct index_item item;
        item.created_time = rows.IntValue("created_time");
        item.doc_id = rows.StringValue("doc_id");
        item.freq = 0;
        item.field = 0;
        item.doc_version = rows.IntValue("doc_version");
        item.end_time = rows.IntValue("end_time");

        const bool expired = item.end_time < now_time;
        if(expired){
            if(!do_delete_top_index_dtc(dtc_server,key,item)){
                log_error("do delete dtc error!");
            }
            continue;
        }

        const bool inserted = indexSet.insert(item).second;
        if(!inserted){
            if(!do_delete_top_index_dtc(dtc_server,key,item)){
                log_error("do delete dtc error!");
            }
        }
    }
    return do_top_index_clipping(dtc_server,key,rows_limit);
}

View File

@ -0,0 +1,95 @@
/*
* =====================================================================================
*
* Filename: index_clipping.h
*
* Description: IndexClipping class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#ifndef SRC_INDEX_GEN_INDEX_CLIPPING_H_
#define SRC_INDEX_GEN_INDEX_CLIPPING_H_
#include <map>
#include <set>
#include <vector>
#include <sstream>
#include "index_conf.h"
#include "dtcapi.h"
using namespace std;
/*
 * One index row as read back from DTC. Instances are ordered by
 * (field, freq, created_time, doc_id); doc_version and end_time take no
 * part in the comparison.
 */
struct index_item {
    uint32_t created_time;
    uint8_t field;
    uint32_t freq;
    string doc_id;
    int doc_version;
    time_t end_time;

    // operator< overload: lexicographic comparison, most significant member first.
    bool friend operator<(const struct index_item &left, const struct index_item &right)
    {
        if (left.field != right.field) {
            return left.field < right.field;
        }
        if (left.freq != right.freq) {
            return left.freq < right.freq;
        }
        if (left.created_time != right.created_time) {
            return left.created_time < right.created_time;
        }
        return left.doc_id < right.doc_id;
    }
};
// Reads the rows stored under one DTC key and deletes rows so that at most
// 80% of a given row limit remain.
class IndexClipping {
public:
IndexClipping(DTC::Server* server);
~IndexClipping();
// Load the rows of `key`, delete duplicates and clip the surplus.
bool get_rows_and_index_clipping(DTC::Server* dtc_server,string key,uint32_t rows_limit);
// Same, but rows whose end_time has passed are deleted first.
bool get_rows_and_top_index_clipping(DTC::Server* dtc_server,string key,uint32_t rows_limit);
private:
// Delete the rows matching `item`; the filter includes the field column.
bool do_delete_index_dtc(DTC::Server* dtc_server,string key,const struct index_item &item);
bool do_index_clipping(DTC::Server* dtc_server,string key,uint32_t rows_limit);
// Delete the rows matching `item`; the filter omits the field column.
bool do_delete_top_index_dtc(DTC::Server* dtc_server,string key,const struct index_item &item);
bool do_top_index_clipping(DTC::Server* dtc_server,string key,uint32_t rows_limit);
// Queries snapshot_server; returns true when a matching row exists or the query fails.
bool is_active_doc(string &doc_id,uint32_t appid,int doc_version,int top);
private:
set<struct index_item> indexSet; // rows collected by the get_rows_* functions, cleared after clipping
DTC::Server* snapshot_server; // server passed to the constructor
};
#endif /* SRC_INDEX_GEN_INDEX_CLIPPING_H_ */

View File

@ -0,0 +1,276 @@
/*
* =====================================================================================
*
* Filename: index_conf.cc
*
* Description: IndexConf class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#include "index_conf.h"
#include "log.h"
#include "comm.h"
#include <fstream>
// Defaults used until the configuration file overrides them.
SGlobalIndexConfig::SGlobalIndexConfig()
    : iTimeout(300),
      iTimeInterval(0),
      iLogLevel(4),
      service_type(106),
      background(true) {
}
// A document record for `app_id`: weight 1, published now, not pinned to the top.
UserTableContent::UserTableContent(uint32_t app_id)
    : appid(app_id),
      weight(1),
      publish_time(time(NULL)),
      top(0),
      top_start_time(0),
      top_end_time(0) {
}
/*
 * Reads the DTC section named dtc_name from the parsed configuration into
 * dtchost.
 *
 * Required members: table_name (string), accesskey (string), timeout (int),
 * keytype (int) and route (array). Every route element must carry ip
 * (string), bid, port, weight and status (all int).
 *
 * Returns 0 on success. The first missing or mistyped member logs an error
 * and returns -RT_PARSE_JSON_ERR; values stored in dtchost before that
 * point are left in place.
 */
int IndexConf::ParseDTCPara(const char *dtc_name,SDTCHost &dtchost) {
    if (!(m_value.isMember(dtc_name) && m_value[dtc_name].isObject())) {
        log_error("parse data error!");
        return -RT_PARSE_JSON_ERR;
    }
    Json::Value dtc_config = m_value[dtc_name];

    if (!(dtc_config.isMember("table_name") && dtc_config["table_name"].isString())) {
        log_error("parse data error!");
        return -RT_PARSE_JSON_ERR;
    }
    dtchost.szTablename = dtc_config["table_name"].asString();

    if (!(dtc_config.isMember("accesskey") && dtc_config["accesskey"].isString())) {
        log_error("parse data error!");
        return -RT_PARSE_JSON_ERR;
    }
    dtchost.szAccesskey = dtc_config["accesskey"].asString();

    if (!(dtc_config.isMember("timeout") && dtc_config["timeout"].isInt())) {
        log_error("parse data error!");
        return -RT_PARSE_JSON_ERR;
    }
    dtchost.uTimeout = dtc_config["timeout"].asInt();

    if (!(dtc_config.isMember("keytype") && dtc_config["keytype"].isInt())) {
        log_error("parse data error!");
        return -RT_PARSE_JSON_ERR;
    }
    dtchost.uKeytype = dtc_config["keytype"].asInt();

    if (!(dtc_config.isMember("route") && dtc_config["route"].isArray())) {
        log_error("parse data error!");
        return -RT_PARSE_JSON_ERR;
    }
    for (int i = 0; i < (int)dtc_config["route"].size(); i++) {
        SDTCroute dtc_route;
        Json::Value route = dtc_config["route"][i];

        if (!(route.isMember("ip") && route["ip"].isString())) {
            log_error("parse data error!");
            return -RT_PARSE_JSON_ERR;
        }
        dtc_route.szIpadrr = route["ip"].asString();

        if (!(route.isMember("bid") && route["bid"].isInt())) {
            log_error("parse data error!");
            return -RT_PARSE_JSON_ERR;
        }
        dtc_route.uBid = route["bid"].asInt();

        if (!(route.isMember("port") && route["port"].isInt())) {
            log_error("parse data error!");
            return -RT_PARSE_JSON_ERR;
        }
        dtc_route.uPort = route["port"].asInt();

        if (!(route.isMember("weight") && route["weight"].isInt())) {
            log_error("parse data error!");
            return -RT_PARSE_JSON_ERR;
        }
        dtc_route.uWeight = route["weight"].asInt();

        if (!(route.isMember("status") && route["status"].isInt())) {
            log_error("parse data error!");
            return -RT_PARSE_JSON_ERR;
        }
        dtc_route.uStatus = route["status"].asInt();

        dtchost.vecRoute.push_back(dtc_route);
    }
    return 0;
}
int IndexConf::ParseGlobalPara()
{
if (m_value.isMember("listen_addr") && m_value["listen_addr"].isString()) {
m_GlobalConf.listen_addr = m_value["listen_addr"].asString();
}
else {
log_error("parse data error!");
return -RT_PARSE_JSON_ERR;
}
if (m_value.isMember("log") && m_value["log"].isString()) {
m_GlobalConf.logPath = m_value["log"].asString();
}
else {
log_error("parse data error!");
return -RT_PARSE_JSON_ERR;
}
if (m_value.isMember("log_level") && m_value["log_level"].isInt()) {
m_GlobalConf.iLogLevel = m_value["log_level"].asInt();
}
else {
log_error("parse data error!");
return -RT_PARSE_JSON_ERR;
}
if (m_value.isMember("pid_file") && m_value["pid_file"].isString()) {
m_GlobalConf.pid_file = m_value["pid_file"].asString();
}
else {
log_error("parse data error!");
return -RT_PARSE_JSON_ERR;
}
if (m_value.isMember("timeout") && m_value["timeout"].isInt()) {
m_GlobalConf.iTimeout = m_value["timeout"].asInt();
}
else {
m_GlobalConf.iTimeout = 5000;
}
if (m_value.isMember("words_file") && m_value["words_file"].isString()) {
m_GlobalConf.sWordsPath = m_value["words_file"].asString();
}
else {
log_error("parse data error!");
return -RT_PARSE_JSON_ERR;
}
if (m_value.isMember("service_type") && m_value["service_type"].isString()) {
if(m_value["service_type"].asString() == "top_index"){
m_GlobalConf.service_type = CMD_TOP_INDEX;//top_index
}else if(m_value["service_type"].asString() == "snapshot"){
m_GlobalConf.service_type = CMD_SNAPSHOT;//snapshot
}else if(m_value["service_type"].asString() == "image"){
m_GlobalConf.service_type = CMD_IMAGE_REPORT;//image_report
}
else
m_GlobalConf.service_type = CMD_INDEX_GEN;//index_gen
m_GlobalConf.service_name = m_value["service_type"].asString();
}
else {
m_GlobalConf.service_type = CMD_INDEX_GEN;//index_gen
}
if (m_value.isMember("stop_words_path") && m_value["stop_words_path"].isString()) {
m_GlobalConf.stopWordsPath = m_value["stop_words_path"].asString();
}
else {
log_error("parse data error!");
return -RT_PARSE_JSON_ERR;
}
if (m_value.isMember("words_base_path") && m_value["words_base_path"].isString()) {
m_GlobalConf.wordsBasePath = m_value["words_base_path"].asString();
}
else {
log_error("parse data error!");
return -RT_PARSE_JSON_ERR;
}
if (m_value.isMember("training_path") && m_value["training_path"].isString()) {
m_GlobalConf.trainingPath = m_value["training_path"].asString();
}
else {
log_error("parse data error!");
return -RT_PARSE_JSON_ERR;
}
if (m_value.isMember("daemon") && m_value["daemon"].isBool()) {
m_GlobalConf.background = m_value["daemon"].asBool();
}
else {
log_error("parse data error!");
return -RT_PARSE_JSON_ERR;
}
if (m_value.isMember("split_mode") && m_value["split_mode"].isString()) {
m_GlobalConf.sSplitMode = m_value["split_mode"].asString();
}
else
{
m_GlobalConf.sSplitMode = "PrePostNGram";
}
if (m_value.isMember("phonetic_path") && m_value["phonetic_path"].isString()) {
m_GlobalConf.sPhoneticPath = m_value["phonetic_path"].asString();
}
else {
log_error("parse data[phonetic_path] error!");
return -RT_PARSE_JSON_ERR;
}
if (m_value.isMember("character_path") && m_value["character_path"].isString()) {
m_GlobalConf.sCharacterPath = m_value["character_path"].asString();
}
else {
log_error("parse data[character_path] error!");
return -RT_PARSE_JSON_ERR;
}
if (m_value.isMember("phonetic_base_file") && m_value["phonetic_base_file"].isString()) {
m_GlobalConf.sPhoneticBasePath = m_value["phonetic_base_file"].asString();
}
else {
log_error("parse data[phonetic_base_file] error!");
return -RT_PARSE_JSON_ERR;
}
return 0;
}
/*
 * Load the JSON configuration file at `path` into m_value and populate the
 * global section plus the two DTC host sections.
 *
 * Returns true only when the file can be opened, parses as JSON, and
 * ParseGlobalPara() and both ParseDTCPara() calls return 0. Every failure is
 * logged and reported as false.
 */
bool IndexConf::ParseConf(string path) {
    Json::Reader json_reader;
    ifstream conf_stream(path.c_str());
    if (!conf_stream) {
        log_error("open file error!");
        return false;
    }
    if (!json_reader.parse(conf_stream, m_value)) {
        log_error("parse json error!");
        return false;
    }
    // Sections are processed in a fixed order; the first one that fails stops the load.
    const bool sections_ok = ParseGlobalPara() == 0
        && ParseDTCPara("dtc_index_config", m_DTCIndexHost) == 0
        && ParseDTCPara("dtc_intelligent_config", m_DTCIntelligentHost) == 0;
    if (!sections_ok) {
        log_error("parse json error!");
        return false;
    }
    return true;
}

View File

@ -0,0 +1,114 @@
/*
* =====================================================================================
*
* Filename: index_conf.h
*
* Description: IndexConf class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#ifndef __INDEX_CONF_H__
#define __INDEX_CONF_H__
#include <vector>
#include <string>
#include <stdint.h>
#include "singleton.h"
#include "json/json.h"
#include <stdint.h>
#include "split_tool.h"
using namespace std;
/*
 * Plain holder for the process-wide settings of the index writer.
 * The constructor is defined outside this header; IndexConf owns one instance
 * (m_GlobalConf) and exposes it through GetGlobalConfig().
 */
class SGlobalIndexConfig {
public:
SGlobalIndexConfig();
~SGlobalIndexConfig(){}
// Numeric settings. NOTE(review): where these are assigned is not visible from this header.
int iTimeout;
int iTimeInterval;
int iLogLevel;
// Set by the config parser to CMD_IMAGE_REPORT or CMD_INDEX_GEN.
int service_type;
string programName;
string listen_addr;
string pid_file;
string logPath;
string sWordsPath;
string sEnWordsPath;
// Filled from the JSON keys "character_path", "phonetic_path" and "phonetic_base_file".
string sCharacterPath;
string sPhoneticPath;
string sPhoneticBasePath;
// Filled from the JSON keys "stop_words_path", "words_base_path" and "training_path".
string stopWordsPath;
string wordsBasePath;
string trainingPath;
// Raw string value of the JSON key "service_type".
string service_name;
// Filled from the JSON key "daemon".
bool background;
// Filled from the JSON key "split_mode"; the parser falls back to "PrePostNGram".
string sSplitMode;
};
/*
 * Per-document fields carried through an index request.
 * The constructor taking the application id is defined outside this header.
 */
class UserTableContent{
public:
UserTableContent(uint32_t app_id);
~UserTableContent(){}
// Application id and document id; together they form the DTC snapshot key.
uint32_t appid;
string doc_id;
string title;
// Written to the snapshot "extend" column by update_sanpshot_dtc when non-empty.
string content;
string author;
string description;
string sp_words;
// Written to the snapshot "weight" column.
int weight;
// Written to the snapshot "created_time" column.
int publish_time;
// NOTE(review): presumably "pinned to top" flag and its validity window — not used in the code visible here.
int top;
int top_start_time;
int top_end_time;
};
/*
 * Configuration holder for the index writer, accessed as a singleton through
 * CSingleton<IndexConf>. ParseConf() fills the members from a JSON file; the
 * getters hand out references to the parsed sections.
 */
class IndexConf {
public:
IndexConf() {
}
// Return the shared instance managed by CSingleton.
static IndexConf *Instance()
{
return CSingleton<IndexConf>::Instance();
}
// Release the shared instance managed by CSingleton.
static void Destroy()
{
CSingleton<IndexConf>::Destroy();
}
// Parse the JSON file at `path`; returns false on any open/parse/validation error.
bool ParseConf(string path);
// Global (non-DTC) settings.
SGlobalIndexConfig &GetGlobalConfig(){
return m_GlobalConf;
}
// Host settings parsed from the "dtc_index_config" section.
SDTCHost &GetDTCIndexConfig(){
return m_DTCIndexHost;
}
// Host settings parsed from the "dtc_intelligent_config" section.
SDTCHost &GetDTCIntelligentConfig() {
return m_DTCIntelligentHost;
}
private:
// Section parsers; both return 0 on success (ParseConf treats any other value as failure).
int ParseDTCPara(const char *dtc_name,SDTCHost &dtchost) ;
int ParseGlobalPara();
// NOTE(review): declared here but not called by ParseConf — confirm it is still needed.
int ParseMYSQLPara();
private:
SGlobalIndexConfig m_GlobalConf;
SDTCHost m_DTCIndexHost;
SDTCHost m_DTCIntelligentHost;
// Root of the parsed JSON document.
Json::Value m_value;
};
#endif

View File

@ -0,0 +1,689 @@
/*
* =====================================================================================
*
* Filename: index_tbl_op.cc
*
* Description: CIndexTableManager and DeleteTask implementation (DTC index table operations).
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#include "index_tbl_op.h"
#include "index_clipping.h"
#include "add_request_proc.h"
#include <sstream>
// Process-wide table managers, each wrapping its own DTC::Server connection.
// g_IndexInstance is used on the request path (index_gen_process).
CIndexTableManager g_IndexInstance;
// g_delIndexInstance is used by the DeleteTask worker thread (ProcessCycle).
CIndexTableManager g_delIndexInstance;
// NOTE(review): presumably the connection for the hanpin/intelligent table — not referenced in this file.
CIndexTableManager g_hanpinIndexInstance;
/*
 * Build the fixed-width access key for a document: doc_id right-justified in a
 * 40-character field padded with spaces on the left, or the first 40
 * characters when doc_id is longer.
 *
 * The returned pointer refers to a per-thread buffer that is overwritten by
 * the next call on the same thread, so callers must consume it immediately
 * (all callers pass it straight to SetAccessKey). The buffer is per-thread
 * because this helper is reached both from the DeleteTask worker thread and
 * from the request path; a single shared buffer let one thread clobber the
 * key another thread was about to send.
 */
static char* gen_access_key(const std::string &doc_id){
    static __thread char tmp[41];
    snprintf(tmp, sizeof(tmp), "%40s", doc_id.c_str());
    return tmp;
}
/*
 * Issue a DTC get for the snapshot row of (fields.appid, fields.doc_id),
 * requesting only the "trans_version" column.
 * Returns whatever Execute() returns; the rows are left in `rst`.
 */
static int get_snapshot_execute(DTC::Server* dtc_server, const UserTableContent &fields, DTC::Result &rst){
    const string snapshot_key = gen_dtc_key_string(fields.appid, "10", fields.doc_id);
    DTC::GetRequest request(dtc_server);
    request.SetKey(snapshot_key.c_str());
    request.Need("trans_version");
    // Only the outcome of Execute() is reported to the caller.
    return request.Execute(rst);
}
/*
 * Insert one inverted-index row for `key` describing the posting `it`.
 *
 * The "location" column receives the first 26 entries of it.indexs rendered
 * as "[p0,p1,...]", or an empty string when there are no entries. Both
 * "doc_version" and "trans_version" are written with doc_version.
 * Returns the result of Execute(); details are left in `rst`.
 */
static int insert_index_execute(DTC::Server* dtcServer,string key,struct item &it,u_int8_t field_type,int doc_version,DTC::Result &rst){
    string location;
    if (it.indexs.size() != 0) {
        const size_t kept = it.indexs.size() < 26 ? it.indexs.size() : 26;
        stringstream joined;
        for (size_t pos = 0; pos < kept; ++pos) {
            joined << (pos == 0 ? "[" : ",") << it.indexs[pos];
        }
        joined << "]";
        location = joined.str();
    }

    DTC::InsertRequest request(dtcServer);
    request.SetKey(key.c_str());
    request.Set("doc_id", it.doc_id.c_str());
    request.Set("field", field_type);
    request.Set("word_freq", it.freq);
    request.Set("weight", 1);
    request.Set("extend", it.extend.c_str());
    request.Set("doc_version",doc_version);
    request.Set("trans_version",doc_version);
    request.Set("created_time",time(NULL));
    request.Set("location", location.c_str());
    request.Set("start_time", 0);
    request.Set("end_time", 0);
    return request.Execute(rst);
}
/*
 * Insert one row into the intelligent (suggestion) table for `word`.
 *
 * Besides "word", "doc_id" and "doc_version", the row carries the 8
 * charact_id, 8 phonetic_id and 16 initial_char values of `info`, each in its
 * own numbered column. Returns the result of Execute(); details are in `rst`.
 */
static int insert_intelligent_execute(DTC::Server* dtcServer, string key, string doc_id, string word, const IntelligentInfo &info, DTC::Result &rst, int doc_version) {
    // Column names, indexed in step with the matching array in IntelligentInfo.
    static const char *const charact_cols[] = {
        "charact_id_01", "charact_id_02", "charact_id_03", "charact_id_04",
        "charact_id_05", "charact_id_06", "charact_id_07", "charact_id_08"
    };
    static const char *const phonetic_cols[] = {
        "phonetic_id_01", "phonetic_id_02", "phonetic_id_03", "phonetic_id_04",
        "phonetic_id_05", "phonetic_id_06", "phonetic_id_07", "phonetic_id_08"
    };
    static const char *const initial_cols[] = {
        "initial_char_01", "initial_char_02", "initial_char_03", "initial_char_04",
        "initial_char_05", "initial_char_06", "initial_char_07", "initial_char_08",
        "initial_char_09", "initial_char_10", "initial_char_11", "initial_char_12",
        "initial_char_13", "initial_char_14", "initial_char_15", "initial_char_16"
    };

    DTC::InsertRequest request(dtcServer);
    request.SetKey(key.c_str());
    request.Set("word", word.c_str());
    request.Set("doc_id", doc_id.c_str());
    request.Set("doc_version", doc_version);
    for (size_t i = 0; i < sizeof(charact_cols) / sizeof(charact_cols[0]); ++i) {
        request.Set(charact_cols[i], info.charact_id[i]);
    }
    for (size_t i = 0; i < sizeof(phonetic_cols) / sizeof(phonetic_cols[0]); ++i) {
        request.Set(phonetic_cols[i], info.phonetic_id[i]);
    }
    for (size_t i = 0; i < sizeof(initial_cols) / sizeof(initial_cols[0]); ++i) {
        request.Set(initial_cols[i], info.initial_char[i].c_str());
    }
    return request.Execute(rst);
}
int CIndexTableManager::InitServer(const SDTCHost &dtchost) {
string _MasterAddress = "127.0.0.1";
stringstream ss;
uint32_t port = 0;
if (dtchost.vecRoute.size() > 0) {
SDTCroute route = dtchost.vecRoute[0];
port = route.uPort;
_MasterAddress = route.szIpadrr;
}
ss << ":" << port << "/tcp";
string master_bind_port = ss.str();
_MasterAddress.append(master_bind_port);
log_info("master address is [%s]", _MasterAddress.c_str());
server.StringKey();
server.SetTableName(dtchost.szTablename.c_str());
server.SetAddress(_MasterAddress.c_str());
server.SetMTimeout(300);
int ret;
if ((ret = server.Ping()) != 0 && ret != -DTC::EC_TABLE_MISMATCH) {
log_error("ping server[%s] failed, err: %d", _MasterAddress.c_str(), ret);
return -1;
}
return ret;
}
bool CIndexTableManager::DeleteIndex(std::string word, const std::string& doc_id, uint32_t doc_version, uint32_t field){
DTC::Server* dtc_server = &server;
if (NULL == dtc_server) {
log_error("dtc_server is null !");
return false;
}
dtc_server->SetAccessKey(gen_access_key(doc_id));
DTC::DeleteRequest delReq(dtc_server);
int ret = delReq.SetKey(word.c_str());
ret |= delReq.EQ("doc_id", doc_id.c_str());
ret |= delReq.EQ("doc_version", doc_version);
ret |= delReq.EQ("field", field);
DTC::Result rst;
ret = delReq.Execute(rst);
if(ret != 0)
{
log_error("delete request error! errcode %d,errmsg %s, errfrom %s", ret, rst.ErrorMessage(), rst.ErrorFrom());
return false;
}
return true;
}
/*
 * Delete the snapshot row of (appid, doc_id).
 * Returns 0 on success and RT_ERROR_DELETE_SNAPSHOT when Execute() fails.
 * `res` is accepted for interface compatibility and is not touched.
 */
int CIndexTableManager::delete_snapshot_dtc(string &doc_id, uint32_t appid, Json::Value &res){
    DTC::Server* dtc_server = &server;
    if (dtc_server == NULL) {
        log_error("snapshot server connect error!");
        return RT_ERROR_GET_SNAPSHOT;
    }
    dtc_server->SetAccessKey(gen_access_key(doc_id));

    const string snapshot_key = gen_dtc_key_string(appid, "10", doc_id);
    DTC::DeleteRequest request(dtc_server);
    request.SetKey(snapshot_key.c_str());

    DTC::Result rst;
    const int rc = request.Execute(rst);
    if (rc == 0) {
        return 0;
    }
    log_error("delete request error! ,errno %d ,errmsg %s, errfrom %s\n", rc,rst.ErrorMessage(), rst.ErrorFrom());
    return RT_ERROR_DELETE_SNAPSHOT;
}
/*
 * Delete the rows stored under `key` whose "doc_id" column equals doc_id.
 *
 * Returns 0 on success, otherwise the non-zero code reported by the DTC call
 * that failed (SetKey, EQ or Execute).
 *
 * The delete is only executed when both the key and the doc_id condition were
 * accepted; previously their results were overwritten by Execute(), so a
 * rejected condition went unnoticed.
 */
int CIndexTableManager::delete_hanpin_index(string key, string doc_id) {
    // `server` is a member object, so its address can never be NULL; no check needed.
    server.SetAccessKey(gen_access_key(doc_id));
    DTC::DeleteRequest deleteReq(&server);

    int ret = deleteReq.SetKey(key.c_str());
    if (ret == 0) ret = deleteReq.EQ("doc_id", doc_id.c_str());
    if (ret != 0) {
        log_error("build delete request error! key %s, doc_id %s, errcode %d", key.c_str(), doc_id.c_str(), ret);
        return ret;
    }

    DTC::Result rst;
    ret = deleteReq.Execute(rst);
    if (ret != 0){
        log_error("delete request error! ,errno %d ,errmsg %s, errfrom %s\n", ret, rst.ErrorMessage(), rst.ErrorFrom());
    }
    else {
        log_debug("delete key = %s doc_id = %s", key.c_str(), doc_id.c_str());
    }
    return ret;
}
int CIndexTableManager::get_snapshot_active_doc(const UserTableContent &fields, int &doc_version, Json::Value &res){
int ret;
DTC::Server* dtc_server = &server;
if(NULL == dtc_server){
log_error("snapshot server connect error!");
return RT_ERROR_GET_SNAPSHOT;
}
DTC::Result rst;
ret = get_snapshot_execute(dtc_server,fields,rst);
if (ret != 0) {
if (ret == -110) {
rst.Reset();
ret = get_snapshot_execute(dtc_server,fields,rst);
if (ret != 0) {
log_error("get request error! errcode %d,errmsg %s, errfrom %s", ret, rst.ErrorMessage(), rst.ErrorFrom());
return RT_ERROR_GET_SNAPSHOT;
}
}
else {
log_error("get request error! errcode %d,errmsg %s, errfrom %s", ret, rst.ErrorMessage(), rst.ErrorFrom());
return RT_ERROR_GET_SNAPSHOT;
}
}
int cnt = rst.NumRows();
if (rst.NumRows() <= 0) {
return RT_NO_THIS_DOC;
}
else {
for (int i = 0; i < cnt; i++) {
rst.FetchRow();
doc_version = rst.IntValue("trans_version");
}
}
return 0;
}
/*
 * Insert one inverted-index row per entry of word_map.
 *
 * The DTC key of each row is built from app_id, the type tag "00" and the
 * word. Stops at the first failing insert and returns
 * RT_ERROR_INSERT_INDEX_DTC; returns 0 when every insert succeeded.
 */
int CIndexTableManager::do_insert_index(map<string, item> &word_map, uint64_t app_id,int doc_version,int field,Json::Value &res) {
    for (map<string, item>::iterator entry = word_map.begin(); entry != word_map.end(); ++entry) {
        const string key = gen_dtc_key_string(app_id, "00", entry->first);
        item posting = entry->second;
        const int rc = insert_index_dtc(key, posting, field, doc_version, res);
        // Logged for every attempt, successful or not.
        log_debug("key = %s,doc_vesion = %d,docid = %s\n",key.c_str(),doc_version,posting.doc_id.c_str());
        if (rc != 0) {
            return RT_ERROR_INSERT_INDEX_DTC;
        }
    }
    return 0;
}
/*
 * Insert a single inverted-index row and record its key in `res`.
 *
 * The key is appended to res[field_type] before the insert is attempted. The
 * access key is doc_id right-justified in 40 characters. Returns 0 on success
 * and -1 when the insert fails.
 */
int CIndexTableManager::insert_index_dtc(string key, struct item &it, u_int8_t field_type, int doc_version, Json::Value &res){
    res[field_type].append(key);

    char access_key[41];
    snprintf(access_key, sizeof(access_key), "%40s", it.doc_id.c_str());
    server.SetAccessKey(access_key);

    DTC::Result rst;
    const int rc = insert_index_execute(&server, key, it, field_type, doc_version, rst);
    if (rc == 0) {
        return 0;
    }
    log_error("insert request error! ,errno %d ,errmsg %s, errfrom %s\n", rc,rst.ErrorMessage(), rst.ErrorFrom());
    return -1;
}
/*
 * Insert one intelligent-table row per element of info_vec, all under `key`.
 *
 * The access key is doc_id right-justified in 40 characters. Stops at the
 * first failing insert and returns -1; returns 0 when all rows were written
 * (including when info_vec is empty).
 */
int CIndexTableManager::do_insert_intelligent(string key, string doc_id, string word, const vector<IntelligentInfo> & info_vec, int doc_version) {
    char access_key[41];
    snprintf(access_key, sizeof(access_key), "%40s", doc_id.c_str());
    server.SetAccessKey(access_key);

    for (vector<IntelligentInfo>::const_iterator info = info_vec.begin(); info != info_vec.end(); ++info) {
        DTC::Result rst;
        const int rc = insert_intelligent_execute(&server, key, doc_id, word, *info, rst, doc_version);
        if (rc != 0) {
            log_error("insert request error! ,errno %d ,errmsg %s, errfrom %s\n", rc, rst.ErrorMessage(), rst.ErrorFrom());
            return -1;
        }
    }
    return 0;
}
int CIndexTableManager::update_sanpshot_dtc(const UserTableContent &fields,int doc_version,int trans_version,int &affected_rows){
int ret = 0;
DTC::Server* dtc_server = &server;
if(NULL == dtc_server){
log_error("snapshot server connect error!");
return RT_ERROR_GET_SNAPSHOT;
}
dtc_server->SetAccessKey(gen_access_key(fields.doc_id));
DTC::UpdateRequest updateReq(dtc_server);
ret = updateReq.SetKey(gen_dtc_key_string(fields.appid, "10", fields.doc_id).c_str());
updateReq.Set("doc_version", doc_version);
if(fields.content.length() > 0)
updateReq.Set("extend", fields.content.c_str());
updateReq.Set("weight",fields.weight);
updateReq.Set("created_time",fields.publish_time);
updateReq.EQ("trans_version", trans_version);
DTC::Result rst;
ret = updateReq.Execute(rst);
if (ret != 0)
{
log_error("updateReq error! ,errno %d ,errmsg %s, errfrom %s\n", ret,rst.ErrorMessage(), rst.ErrorFrom());
return RT_ERROR_UPDATE_SNAPSHOT;
}
affected_rows = rst.AffectedRows();
return ret;
}
int CIndexTableManager::update_sanpshot_dtc(uint32_t appid, string doc_id, int trans_version){
DTC::Server* dtc_server = &server;
if(NULL == dtc_server){
log_error("snapshot server connect error!");
return RT_ERROR_GET_SNAPSHOT;
}
dtc_server->SetAccessKey(gen_access_key(doc_id));
DTC::UpdateRequest updateReq(dtc_server);
int ret = updateReq.SetKey(gen_dtc_key_string(appid, "10", doc_id).c_str());
updateReq.Set("trans_version", trans_version - 1);
updateReq.EQ("trans_version", trans_version);
DTC::Result rst;
ret = updateReq.Execute(rst);
if (ret != 0)
{
log_error("delete request error! ,errno %d ,errmsg %s, errfrom %s\n", ret,rst.ErrorMessage(), rst.ErrorFrom());
return RT_ERROR_DELETE_SNAPSHOT;
}
return 0;
}
int CIndexTableManager::update_snapshot_version(const UserTableContent &fields,int doc_version,int &affected_rows){
int ret = 0;
DTC::Server* dtc_server = &server;
if(NULL == dtc_server){
log_error("snapshot server connect error!");
return RT_ERROR_GET_SNAPSHOT;
}
dtc_server->SetAccessKey(gen_access_key(fields.doc_id));
DTC::UpdateRequest updateReq(dtc_server);
ret = updateReq.SetKey(gen_dtc_key_string(fields.appid, "10", fields.doc_id).c_str());
updateReq.Set("trans_version", doc_version);
updateReq.EQ("trans_version", doc_version - 1);
DTC::Result rst;
ret = updateReq.Execute(rst);
if (ret != 0)
{
log_error("updateReq error! ,errno %d ,errmsg %s, errfrom %s\n", ret,rst.ErrorMessage(), rst.ErrorFrom());
return RT_ERROR_UPDATE_SNAPSHOT;
}
affected_rows = rst.AffectedRows();
return ret;
}
int CIndexTableManager::insert_snapshot_version(const UserTableContent &fields,int doc_version){
int ret = 0;
DTC::Server* dtc_server = &server;
if(NULL == dtc_server){
log_error("snapshot server connect error!");
return RT_ERROR_GET_SNAPSHOT;
}
dtc_server->SetAccessKey(gen_access_key(fields.doc_id));
DTC::InsertRequest insertReq(dtc_server);
ret = insertReq.SetKey(gen_dtc_key_string(fields.appid, "10", fields.doc_id).c_str());
insertReq.Set("doc_version", 0);
insertReq.Set("trans_version", doc_version);
insertReq.Set("doc_id", fields.doc_id.c_str());
DTC::Result rst;
ret = insertReq.Execute(rst);
if (ret != 0)
{
log_error("insertReq error! ,errno %d ,errmsg %s, errfrom %s\n", ret,rst.ErrorMessage(), rst.ErrorFrom());
return RT_ERROR_INSERT_SNAPSHOT;
}
return ret;
}
int CIndexTableManager::update_docid_index_dtc(const string & invert_keys, const string & doc_id, uint32_t appid, int doc_version)
{
int ret = 0;
DTC::Server* dtc_server = &server;
if (NULL == dtc_server) {
log_error("snapshot server connect error!");
return RT_ERROR_GET_SNAPSHOT;
}
dtc_server->SetAccessKey(gen_access_key(doc_id));
DTC::UpdateRequest updateReq(dtc_server);
ret = updateReq.SetKey(gen_dtc_key_string(appid, "20", doc_id).c_str());
updateReq.Set("doc_version", doc_version);
if (invert_keys.length() > 0)
updateReq.Set("extend", invert_keys.c_str());
DTC::Result rst;
ret = updateReq.Execute(rst);
if (ret != 0)
{
log_error("updateReq error! ,errno %d ,errmsg %s, errfrom %s\n", ret, rst.ErrorMessage(), rst.ErrorFrom());
return RT_ERROR_UPDATE_SNAPSHOT;
}
return ret;
}
int CIndexTableManager::insert_docid_index_dtc(const string & invert_keys, const string & doc_id, uint32_t appid, int doc_version)
{
int ret = 0;
DTC::Server* dtc_server = &server;
if (NULL == dtc_server) {
log_error("snapshot server connect error!");
return RT_ERROR_GET_SNAPSHOT;
}
dtc_server->SetAccessKey(gen_access_key(doc_id));
DTC::InsertRequest insertReq(dtc_server);
ret = insertReq.SetKey(gen_dtc_key_string(appid, "20", doc_id).c_str());
insertReq.Set("doc_id", doc_id.c_str());
insertReq.Set("doc_version", doc_version);
if (invert_keys.length() > 0)
insertReq.Set("extend", invert_keys.c_str());
DTC::Result rst;
ret = insertReq.Execute(rst);
if (ret != 0)
{
log_error("updateReq error! ,errno %d ,errmsg %s, errfrom %s\n", ret, rst.ErrorMessage(), rst.ErrorFrom());
return RT_ERROR_INSERT_INDEX_DTC;
}
return ret;
}
int CIndexTableManager::insert_union_index_dtc(const string & union_key, const string & doc_id, uint32_t appid, int doc_version)
{
int ret = 0;
DTC::Server* dtc_server = &server;
if (NULL == dtc_server) {
log_error("snapshot server connect error!");
return RT_ERROR_GET_SNAPSHOT;
}
dtc_server->SetAccessKey(gen_access_key(doc_id));
DTC::InsertRequest insertReq(dtc_server);
ret = insertReq.SetKey(gen_dtc_key_string(appid, "00", union_key).c_str());
insertReq.Set("doc_id", doc_id.c_str());
insertReq.Set("doc_version", doc_version);
insertReq.Set("trans_version", doc_version);
insertReq.Set("created_time", time(NULL));
insertReq.Set("word_freq", 1);
insertReq.Set("weight", 1);
DTC::Result rst;
ret = insertReq.Execute(rst);
if (ret != 0)
{
log_error("updateReq error! ,errno %d ,errmsg %s, errfrom %s\n", ret, rst.ErrorMessage(), rst.ErrorFrom());
return RT_ERROR_INSERT_INDEX_DTC;
}
return ret;
}
int CIndexTableManager::delete_docid_index_dtc(const string & key, const string & doc_id){
int ret = 0;
DTC::Server* dtc_server = &server;
if (NULL == dtc_server) {
log_error("snapshot server connect error!");
return RT_ERROR_GET_SNAPSHOT;
}
dtc_server->SetAccessKey(gen_access_key(doc_id));
DTC::DeleteRequest deleteReq(dtc_server);
ret = deleteReq.SetKey(key.c_str());
DTC::Result rst;
ret = deleteReq.Execute(rst);
if(ret != 0)
{
log_error("deleteReq error! ,errno %d ,errmsg %s, errfrom %s\n", ret, rst.ErrorMessage(), rst.ErrorFrom());
return RT_ERROR_DELETE_SNAPSHOT;
}
return ret;
}
bool CIndexTableManager::GetIndexData(const std::string& doc_id, uint32_t doc_version, map<uint32_t, vector<string> > &res){
DTC::Server* dtc_server = &server;
if (NULL == dtc_server) {
log_error("dtc_server is null !");
return false;
}
DTC::GetRequest getReq(dtc_server);
int ret = getReq.SetKey(doc_id.c_str());
ret |= getReq.EQ("doc_version", doc_version);
ret |= getReq.Need("extend");
DTC::Result rst;
ret = getReq.Execute(rst);
if(ret != 0)
{
log_error("get request error! errcode %d,errmsg %s, errfrom %s", ret, rst.ErrorMessage(), rst.ErrorFrom());
return false;
}
int cnt = rst.NumRows();
if(cnt <= 0)
{
log_debug("can not find any result. key:%s", doc_id.c_str());
return false;
}
rst.FetchRow();
string extend = rst.StringValue("extend");
Json::Reader reader;
Json::Value value;
if(!reader.parse(extend, value, false))
{
log_error("parse json error!\ndata:%s errors:%s\n",extend.c_str(),reader.getFormattedErrorMessages().c_str());
return false;
}
if(value.isArray()){
for(int i = 0;i < (int)value.size();i++){
Json::Value info = value[i];
if(info.isArray()){
for(int j = 0;j < (int)info.size();j++){
if(info[j].isString()){
res[i].push_back(info[j].asString());
}
}
}
}
}
return true;
}
bool CIndexTableManager::delete_index(std::string word, const std::string& doc_id, uint32_t doc_version, uint32_t field){
DTC::Server* dtc_server = &server;
if (NULL == dtc_server) {
log_error("dtc_server is null !");
return false;
}
dtc_server->SetAccessKey(gen_access_key(doc_id));
DTC::DeleteRequest delReq(dtc_server);
int ret = delReq.SetKey(word.c_str());
ret |= delReq.EQ("doc_id", doc_id.c_str());
ret |= delReq.EQ("doc_version", doc_version);
ret |= delReq.EQ("field", field);
DTC::Result rst;
ret = delReq.Execute(rst);
if(ret != 0)
{
log_error("delete request error! errcode %d,errmsg %s, errfrom %s", ret, rst.ErrorMessage(), rst.ErrorFrom());
return false;
}
return true;
}
bool CIndexTableManager::delete_intelligent(std::string key, std::string doc_id, uint32_t trans_version){
DTC::Server* intelligent_server = &server;
if(NULL == intelligent_server){
log_error("GetServer error!");
return false;
}
intelligent_server->SetAccessKey(gen_access_key(doc_id));
DTC::DeleteRequest deleteReq(intelligent_server);
int ret = deleteReq.SetKey(key.c_str());
deleteReq.EQ("doc_id", doc_id.c_str());
deleteReq.EQ("doc_version", trans_version);
DTC::Result rst;
ret = deleteReq.Execute(rst);
if (ret != 0)
{
log_error("delete request error! ,errno %d ,errmsg %s, errfrom %s\n", ret,rst.ErrorMessage(), rst.ErrorFrom());
return false;
}
return true;
}
void * DeleteTask::ProcessCycle(void * arg)
{
int statistic_period = 5;
pthread_mutex_lock(&DeleteTask::GetInstance()._Mutex);
std::vector<DeleteItem> temp_result;
int last_append_time = time(NULL);
while (!DeleteTask::GetInstance()._StopFlag) {
if (DeleteTask::GetInstance()._InfoHead == NULL) {
if(temp_result.size() != 0){
for (std::vector<DeleteItem>::iterator it = temp_result.begin(); it != temp_result.end(); it++) {
DeleteItem item = *it;
g_delIndexInstance.DeleteIndex(item.word, item.doc_id, item.doc_version, item.field);
}
temp_result.clear();
}
pthread_cond_wait(&DeleteTask::GetInstance()._NotEmpty, &DeleteTask::GetInstance()._Mutex);
continue;
}
DeleteItem *head = DeleteTask::GetInstance()._InfoHead;
DeleteTask::GetInstance()._InfoHead = DeleteTask::GetInstance()._InfoTail = NULL;
pthread_mutex_unlock(&DeleteTask::GetInstance()._Mutex);
DeleteTask::GetInstance().Coalesce(head, temp_result);
int now_time = time(NULL);
if (now_time - last_append_time >= statistic_period) {
last_append_time = now_time;
for (std::vector<DeleteItem>::iterator it = temp_result.begin(); it != temp_result.end(); it++) {
DeleteItem item = *it;
g_delIndexInstance.DeleteIndex(item.word, item.doc_id, item.doc_version, item.field);
}
temp_result.clear();
}
pthread_mutex_lock(&DeleteTask::GetInstance()._Mutex);
}
return NULL;
}
/*
 * Drain the linked list starting at `head` into temp_result, skipping nodes
 * that compare equal to an element already in temp_result.
 * Every node of the list is deleted, whether or not it was copied.
 */
void DeleteTask::Coalesce(DeleteItem * head, std::vector<DeleteItem>& temp_result)
{
    for (DeleteItem *node = head; node != NULL; ) {
        bool already_pending = false;
        for (size_t i = 0; i < temp_result.size() && !already_pending; ++i) {
            already_pending = (temp_result[i] == *node);
        }
        if (!already_pending) {
            temp_result.push_back(*node);
        }
        // Advance before freeing the node we just handled.
        DeleteItem *consumed = node;
        node = node->_Next;
        delete consumed;
    }
}
bool DeleteTask::Initialize(){
_InfoHead = _InfoTail = NULL;
return true;
}
void DeleteTask::RegisterInfo(const std::string& word, const std::string& doc_id, uint32_t doc_version, uint32_t field) {
DeleteItem *item = new DeleteItem();
if (item != NULL) {
item->word = word;
item->doc_id = doc_id;
item->doc_version = doc_version;
item->field = field;
PushReportItem(item);
}
}

View File

@ -0,0 +1,126 @@
/*
* =====================================================================================
*
* Filename: index_tbl_op.h
*
* Description: CIndexTableManager, DeleteItem and DeleteTask class declarations.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#ifndef INDEX_TBL_OP_H
#define INDEX_TBL_OP_H
#include "log.h"
#include "dtcapi.h"
#include "split_tool.h"
#include "index_conf.h"
#include "json/json.h"
#include "dtc_tools.h"
#include "comm.h"
/*
 * Thin wrapper around one DTC::Server connection offering the table
 * operations used by the index writer (snapshot rows, inverted-index rows,
 * doc-id index rows and intelligent/hanpin rows).
 * int-returning methods return 0 on success and a non-zero code on failure;
 * bool-returning methods return true on success.
 */
class CIndexTableManager
{
public:
// Configure the connection from dtchost and ping the server.
int InitServer(const SDTCHost &dtchost);
// Delete index rows under key `word` matching doc_id, doc_version and field.
bool DeleteIndex(std::string word, const std::string& doc_id, uint32_t doc_version, uint32_t field);
// Delete the snapshot row of (appid, doc_id).
int delete_snapshot_dtc(string &doc_id, uint32_t appid, Json::Value &res);
// Delete rows under `key` whose doc_id matches.
int delete_hanpin_index(string key, string doc_id);
// Read the snapshot's trans_version into doc_version.
int get_snapshot_active_doc(const UserTableContent &fields, int &doc_version, Json::Value &res);
// Insert one index row per word in word_map.
int do_insert_index(map<string, item> &word_map, uint64_t app_id,int doc_version,int field,Json::Value &res);
// Insert a single index row and record its key in res[field_type].
int insert_index_dtc(string key, struct item &it, u_int8_t field_type, int doc_version, Json::Value &res);
// Insert one intelligent-table row per element of info_vec.
int do_insert_intelligent(string key, string doc_id, string word, const vector<IntelligentInfo> & info_vec, int doc_version);
// Conditional snapshot update (row must still carry trans_version).
int update_sanpshot_dtc(const UserTableContent &fields,int doc_version,int trans_version,int &affected_rows);
// Step the snapshot's trans_version back by one.
int update_sanpshot_dtc(uint32_t appid, string doc_id, int trans_version);
// Advance the snapshot's trans_version from doc_version - 1 to doc_version.
int update_snapshot_version(const UserTableContent &fields,int doc_version,int &affected_rows);
// Create the snapshot row with trans_version = doc_version.
int insert_snapshot_version(const UserTableContent &fields,int doc_version);
// Update / insert the doc-id index row that stores the inverted keys of a document.
int update_docid_index_dtc(const string & invert_keys, const string & doc_id, uint32_t appid, int doc_version);
int insert_docid_index_dtc(const string & invert_keys, const string & doc_id, uint32_t appid, int doc_version);
// Insert an index row for a union key.
int insert_union_index_dtc(const string & union_key, const string & doc_id, uint32_t appid, int doc_version);
// Delete every row under an already-built key.
int delete_docid_index_dtc(const string & key, const string & doc_id);
// Load and decode the "extend" column of a doc-id index row into res.
bool GetIndexData(const std::string& doc_id, uint32_t doc_version, map<uint32_t, vector<string> > &res);
// Same operation as DeleteIndex.
bool delete_index(std::string word, const std::string& doc_id, uint32_t doc_version, uint32_t field);
// Delete intelligent-table rows under `key` matching doc_id and version.
bool delete_intelligent(std::string key, std::string doc_id, uint32_t trans_version);
private:
// The connection every method above operates on.
DTC::Server server;
};
// Process-wide table managers; the objects themselves are defined in index_tbl_op.cc.
extern CIndexTableManager g_IndexInstance;
extern CIndexTableManager g_delIndexInstance;
extern CIndexTableManager g_hanpinIndexInstance;
/*
 * One pending index delete, used both as a node of DeleteTask's intrusive
 * singly-linked queue and as an element of its de-duplicated batch.
 * All state is private; only DeleteTask (a friend) reads or writes it.
 */
class DeleteItem {
public:
    friend class DeleteTask;
    // Start from a fully defined state: zero version/field and no successor.
    DeleteItem() : doc_version(0), field(0), _Next(NULL) {}
    // Two items are equal when they describe the same delete; the queue link
    // (_Next) does not take part in the comparison.
    bool operator==(const DeleteItem& a) const {
        return this->word == a.word &&
               this->doc_id == a.doc_id &&
               this->doc_version == a.doc_version &&
               this->field == a.field;
    }
private:
    std::string word;       // index key of the rows to delete
    std::string doc_id;     // document the rows belong to
    uint32_t doc_version;   // version of the rows to delete
    uint32_t field;         // field id of the rows to delete
    DeleteItem *_Next;      // next node in DeleteTask's queue, or NULL
};
class DeleteTask{
public:
static DeleteTask& GetInstance() {
static DeleteTask instance;
return instance;
}
bool Initialize();
void RegisterInfo(const std::string& word, const std::string& doc_id, uint32_t doc_version, uint32_t field);
private:
pthread_t _ReportThread;
pthread_cond_t _NotEmpty;
pthread_mutex_t _Mutex;
DeleteItem *_InfoHead;
DeleteItem *_InfoTail;
bool _StopFlag;
private:
static void *ProcessCycle(void *arg);
DeleteTask() {
pthread_mutex_init(&_Mutex, NULL);
pthread_cond_init(&_NotEmpty, NULL);
pthread_create(&_ReportThread, NULL, ProcessCycle, NULL);
_StopFlag = false;
}
~DeleteTask() {
_StopFlag = true;
pthread_cond_signal(&_NotEmpty);
pthread_join(_ReportThread, NULL);
}
void Coalesce(DeleteItem *head, std::vector<DeleteItem>& temp_result);
void PushReportItem(DeleteItem* item) {
pthread_mutex_lock(&_Mutex);
if (_InfoHead == NULL) {
_InfoHead = _InfoTail = item;
}
else {
_InfoTail->_Next = item;
_InfoTail = item;
}
pthread_cond_signal(&_NotEmpty);
pthread_mutex_unlock(&_Mutex);
}
};
#endif

View File

@ -0,0 +1,307 @@
/*
* =====================================================================================
*
* Filename: index_write.cc
*
* Description: CTaskIndexGen class implementation (index write request handling).
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#include "index_write.h"
#include <iostream>
#include <string>
#include <map>
#include "log.h"
#include "poll_thread.h"
#include "task_request.h"
#include "dtc_tools.h"
#include "index_clipping.h"
#include "monitor.h"
#include "chash.h"
#include "index_tbl_op.h"
#include "geohash.h"
#include "add_request_proc.h"
/*
 * Build the index-generation task handler on top of poll thread `o`.
 * The same thread pointer is passed to the CTaskDispatcher base, stored as
 * ownerThread and used to construct `output`; the handler starts with
 * read_only cleared (0).
 */
CTaskIndexGen::CTaskIndexGen(CPollThread * o) :
CTaskDispatcher<CTaskRequest>(o),
ownerThread(o),
output(o),
read_only(0)
{
}
// Nothing to release explicitly; the body is intentionally empty.
CTaskIndexGen::~CTaskIndexGen()
{
}
/*
 * Extract the three mandatory top-level members of an index request.
 *
 *   "table_content" (array) -> subreq
 *   "appid"         (int)   -> id
 *   "fields_count"  (int)   -> count
 *
 * Members are checked in that order. The first one that is missing or has the
 * wrong type yields RT_NO_TABLE_CONTENT, RT_NO_APPID or RT_NO_FIELD_COUNT
 * respectively; outputs assigned before the failing member keep their new
 * values. Returns 0 when all three are present.
 */
int CTaskIndexGen::decode_request(const Json::Value & req, Json::Value & subreq, uint32_t & id, uint32_t & count)
{
    if (!req.isMember("table_content") || !req["table_content"].isArray()) {
        return RT_NO_TABLE_CONTENT;
    }
    subreq = req["table_content"];

    if (!req.isMember("appid") || !req["appid"].isInt()) {
        return RT_NO_APPID;
    }
    id = req["appid"].asInt();

    if (!req.isMember("fields_count") || !req["fields_count"].isInt()) {
        return RT_NO_FIELD_COUNT;
    }
    count = req["fields_count"].asInt();

    return 0;
}
/*
 * Decode one element of "table_content".
 *
 * Outputs:
 *   json_fields - the element's "fields" value
 *   fields      - doc_id (and, for add/update, weight) filled from json_fields
 *
 * Returns:
 *   RT_CMD_ADD          cmd is "add" or "update" and a document id was found
 *   RT_CMD_DELETE       cmd is "delete" and a document id was found
 *   RT_ERROR_FIELD      "fields" is missing or not an object for add/update,
 *                       or present but not an object for delete
 *   RT_NO_DOCID         no usable "id"/"doc_id" inside "fields"
 *   RT_ERROR_FIELD_CMD  cmd has any other value, or the element is not an object
 *   0                   the element has no string "cmd" member
 *
 * For add/update the id is taken from "id" (string or int) and falls back to
 * "doc_id" (string); for delete "doc_id" is preferred and "id" is the fallback.
 *
 * The request comes from the client, so value types are verified before
 * member lookup: jsoncpp's isMember() is only valid on object/null values and
 * fails hard on anything else. The delete branch previously skipped that
 * check on "fields", and the element itself was never checked.
 */
static int decode_fields(Json::Value table_content,Json::Value &json_fields,UserTableContent &fields){
    // A null element behaves like an object without members (falls through to "no cmd").
    if (!table_content.isNull() && !table_content.isObject()) {
        return RT_ERROR_FIELD_CMD;
    }
    if (!table_content.isMember("cmd") || !table_content["cmd"].isString()) {
        return 0;
    }

    const string cmd = table_content["cmd"].asString();
    if (cmd == "add" || cmd == "update") {
        if (!table_content.isMember("fields") || !table_content["fields"].isObject()) {
            return RT_ERROR_FIELD;
        }
        json_fields = table_content["fields"];
        if (json_fields.isMember("id") && (json_fields["id"].isString() || json_fields["id"].isInt())) {
            fields.doc_id = json_fields["id"].asString();
        } else if (json_fields.isMember("doc_id") && json_fields["doc_id"].isString()) {
            fields.doc_id = json_fields["doc_id"].asString();
        } else {
            return RT_NO_DOCID;
        }
        if (json_fields.isMember("weight") && json_fields["weight"].isInt()) {
            fields.weight = json_fields["weight"].asInt();
        } else {
            // Default weight when none is supplied.
            fields.weight = 1;
        }
        return RT_CMD_ADD;
    }

    if (cmd == "delete") {
        json_fields = table_content["fields"];
        // A missing "fields" is null here and is reported as RT_NO_DOCID below;
        // any other non-object value is rejected before isMember() is used on it.
        if (!json_fields.isNull() && !json_fields.isObject()) {
            return RT_ERROR_FIELD;
        }
        if (json_fields.isMember("doc_id") && json_fields["doc_id"].isString()) {
            fields.doc_id = json_fields["doc_id"].asString();
        } else if (json_fields.isMember("id") && (json_fields["id"].isString() || json_fields["id"].isInt())) {
            fields.doc_id = json_fields["id"].asString();
        } else {
            return RT_NO_DOCID;
        }
        return RT_CMD_DELETE;
    }

    return RT_ERROR_FIELD_CMD;
}
/*
 * Process one decoded index-write request.
 *
 * req: parsed request object. If it carries an integer "read_only" member the
 *      call only stores that flag and returns 0; otherwise the request is
 *      decoded by decode_request() into an app id, a declared item count and
 *      the array of per-document items.
 * res: response object; passed through to the snapshot helpers, which may
 *      write into it.
 *
 * Returns 0 on success or an RT_* / helper error code. Each item is decoded by
 * decode_fields() and handled as RT_CMD_ADD or RT_CMD_DELETE.
 *
 * NOTE(review): the `continue` paths in the add branch leave `ret` non-zero,
 * so when the last item is skipped that code becomes the return value.
 */
int CTaskIndexGen::index_gen_process(Json::Value &req, Json::Value &res){
int doc_version = 0, old_version = 0, trans_version = 0;
uint32_t app_id, fields_count = 0;
int ret = 0;
Json::Value table_content;
// A request carrying "read_only" is a control message: store the flag and stop.
if (req.isMember("read_only") && req["read_only"].isInt()) {
read_only = req["read_only"].asInt();
return 0;
}
ret = decode_request(req, table_content, app_id, fields_count);
if(ret != 0){
return ret;
}
log_debug("table_content: %s", table_content.toStyledString().c_str());
// The declared item count must be non-zero and match the array length.
if(fields_count == 0 || fields_count != table_content.size()){
return RT_ERROR_FIELD_COUNT;
}
if(!SplitManager::Instance()->is_effective_appid(app_id)){
return RT_NO_APPID;
}
// Writes are rejected while the read-only flag is set.
if (read_only) {
return RT_ERROR_INDEX_READONLY;
}
for(int i = 0;i < (int)table_content.size();i++){
doc_version = 0;
old_version = 0;
trans_version = 0;
UserTableContent content_fields(app_id);
Json::Value json_field;
ret = decode_fields(table_content[i], json_field, content_fields);
if(RT_CMD_ADD == ret){
// Look up the current snapshot to decide the next version numbers.
ret = g_IndexInstance.get_snapshot_active_doc(content_fields, old_version, res);
if(0 == ret){
trans_version = old_version + 1;
doc_version = old_version + 1;
}else if(ret == RT_NO_THIS_DOC){
// First time this document is seen: start at version 1.
trans_version = 1;
doc_version = 1;
} else {
log_error("get_snapshot_active_doc error.");
return ret;
}
if(trans_version != 1){
// Update the snapshot's trans_version field.
int affected_rows = 0;
ret = g_IndexInstance.update_snapshot_version(content_fields, trans_version, affected_rows);
if(0 != ret){
log_error("doc_id[%s] update snapshot version error, continue.", content_fields.doc_id.c_str());
continue;
}
else if(affected_rows == 0){
// No row was updated: reported as a version conflict and the item is skipped.
ret = RT_UPDATE_SNAPSHOT_CONFLICT;
log_info("doc_id[%s] update snapshot conflict, continue.", content_fields.doc_id.c_str());
continue;
}
} else {
ret = g_IndexInstance.insert_snapshot_version(content_fields, trans_version);
if(0 != ret){
// Insert failed: query the snapshot once more and fall back to the update path.
ret = g_IndexInstance.get_snapshot_active_doc(content_fields, old_version, res);
if(0 == ret){
trans_version = old_version + 1;
doc_version = old_version + 1;
int affected_rows = 0;
ret = g_IndexInstance.update_snapshot_version(content_fields, trans_version, affected_rows);
if(0 != ret){
log_error("doc_id[%s] update snapshot version error, continue.", content_fields.doc_id.c_str());
continue;
}
else if(affected_rows == 0){
ret = RT_UPDATE_SNAPSHOT_CONFLICT;
log_info("doc_id[%s] update snapshot conflict, continue.", content_fields.doc_id.c_str());
continue;
}
} else {
log_error("doc_id[%s] insert error, continue.", content_fields.doc_id.c_str());
continue;
}
}
}
// Version bookkeeping succeeded: build the index for this document.
InsertParam insert_param;
insert_param.appid = app_id;
insert_param.doc_id = content_fields.doc_id;
insert_param.doc_version = doc_version;
insert_param.trans_version = trans_version;
AddReqProc add_req_proc(json_field, insert_param);
ret = add_req_proc.do_insert_index(content_fields);
if(0 != ret){
return ret;
}
}
else if(RT_CMD_DELETE == ret){
// Delete from hanpin_index_data: one key "<appid>#<field>" per field reported by getHanpinField().
vector<uint32_t> field_vec;
SplitManager::Instance()->getHanpinField(content_fields.appid, field_vec);
vector<uint32_t>::iterator iter = field_vec.begin();
for (; iter != field_vec.end(); iter++) {
stringstream ss;
ss << content_fields.appid << "#" << *iter;
ret = g_IndexInstance.delete_hanpin_index(ss.str(), content_fields.doc_id);
if (ret != 0) {
log_error("delete error! errcode %d", ret);
return ret;
}
}
// A missing document is tolerated here; any other lookup error aborts the request.
ret = g_IndexInstance.get_snapshot_active_doc(content_fields, old_version,res);
if(ret != 0 && ret != RT_NO_THIS_DOC){
log_error("get_snapshot_active_doc error! errcode %d", ret);
return ret;
}
// Register every (word, field) of the old version with DeleteTask.
// NOTE(review): the result of GetIndexData() is not checked.
map<uint32_t, vector<string> > index_res;
g_IndexInstance.GetIndexData(gen_dtc_key_string(content_fields.appid, "20", content_fields.doc_id), old_version, index_res);
map<uint32_t, vector<string> >::iterator map_iter = index_res.begin();
for(; map_iter != index_res.end(); map_iter++){
uint32_t field = map_iter->first;
vector<string> words = map_iter->second;
// This inner `i` shadows the outer loop index; it only walks `words`.
for(int i = 0; i < (int)words.size(); i++){
DeleteTask::GetInstance().RegisterInfo(words[i], content_fields.doc_id, old_version, field);
}
}
// Only the snapshot delete result is kept in `ret`; the doc-id index delete result is ignored.
ret = g_IndexInstance.delete_snapshot_dtc(content_fields.doc_id, content_fields.appid, res);//not use the doc_version curr
g_IndexInstance.delete_docid_index_dtc(gen_dtc_key_string(content_fields.appid, "20", content_fields.doc_id), content_fields.doc_id);
}
}
return ret;
}
/*
 * Entry point for one incoming task: validates the service type, parses the
 * JSON payload, runs index_gen_process() and replies with a JSON document whose
 * "code" member is 0 on success or the error code otherwise.
 */
void CTaskIndexGen::TaskNotify(CTaskRequest * curr)
{
    log_debug("CTaskIndexGen::TaskNotify start");
    common::CallerInfo caller_info = common::ProfilerMonitor::GetInstance().RegisterInfo(std::string("searchEngine.searchService.indexGenTask"));

    // Nothing to reply to: just record the failure with the profiler.
    if(NULL == curr){
        common::ProfilerMonitor::GetInstance().FunctionError(caller_info);
        common::ProfilerMonitor::GetInstance().RegisterInfoEnd(caller_info);
        return;
    }

    //there is a race condition here:
    //curr may be deleted during process (in task->ReplyNotify())
    CTaskRequest * task = curr;
    Json::Value res;
    res["code"] = 0;

    if(SERVICE_INDEXGEN != task->GetReqCmd()){
        res["code"] = RT_ERROR_SERVICE_TYPE;
        res["reqcmd"] = task->GetReqCmd();
        res["message"] = "service type wrong! need 106";
    }else{
        std::string req = task->buildRequsetString();
        log_debug("recv:%s\n",req.c_str());

        Json::Reader reader;
        Json::Value value;
        if(!reader.parse(req,value,false)){
            log_error("parse json error!\ndata:%s errors:%s\n",req.c_str(),reader.getFormattedErrorMessages().c_str());
            res["code"] = RT_PARSE_JSON_ERR;
            res["message"] = reader.getFormattedErrorMessages();
            res["data"] = req;
        }else if(!value.isObject()){
            // Valid JSON, but the top level is not an object.
            log_error("parse json error!\ndata:%s errors:%s\n",req.c_str(),reader.getFormattedErrorMessages().c_str());
            res["code"] = RT_PARSE_JSON_ERR;
            res["message"] = "it's not a json";
            res["data"] = req;
        }else{
            int ret = index_gen_process(value, res);
            if(0 != ret){
                res["code"] = ret;
            }
        }
    }

    // Common reply path for both success and failure.
    Json::FastWriter writer;
    task->setResult(writer.write(res));
    task->ReplyNotify();
    common::ProfilerMonitor::GetInstance().RegisterInfoEnd(caller_info);
    return;
}

View File

@ -0,0 +1,60 @@
/*
* =====================================================================================
*
* Filename: index_write.h
*
* Description: CTaskIndexGen class declaration.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#ifndef INDEX_GEN_H_
#define INDEX_GEN_H_
#include <set>
#include <vector>
#include <sstream>
#include "request_base.h"
#include "index_conf.h"
#include "dtcapi.h"
#include "comm.h"
#include "split_manager.h"
using namespace std;
class CPollThread;
class CTaskRequest;
class SplitManager;
/*
 * Task dispatcher for index-write requests: TaskNotify() parses each request
 * and index_gen_process() applies it.
 */
class CTaskIndexGen : public CTaskDispatcher<CTaskRequest>
{
private:
CPollThread * ownerThread;
CRequestOutput<CTaskRequest> output;
// Read-only switch, set from a request's "read_only" member by
// index_gen_process(); while non-zero, write requests are rejected.
int read_only;
private:
// Splits a request into its item array (subreq), app id and declared item count.
int decode_request(const Json::Value &req, Json::Value &subreq, uint32_t &id, uint32_t &count);
public:
CTaskIndexGen(CPollThread * o);
virtual ~CTaskIndexGen();
// Applies a parsed request; returns 0 on success or an error code.
int index_gen_process(Json::Value &req,Json::Value &res);
int pre_process(void);
// Forwards the downstream dispatcher to the output stage.
inline void BindDispatcher(CTaskDispatcher<CTaskRequest> *p){
output.BindDispatcher(p);
}
// Called for every incoming task; always sends a JSON reply for a non-NULL task.
virtual void TaskNotify(CTaskRequest * curr);
};
#endif /* INDEX_GEN_H_ */

View File

@ -0,0 +1,281 @@
/*
* =====================================================================================
*
* Filename: main.cc
*
* Description: Entrance.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#include "agent_listen_pkg.h"
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdio.h>
#include <iostream>
#include <fstream>
#include "stat_index.h"
#include "task_request.h"
#include "config.h"
#include "poll_thread.h"
#include "log.h"
#include "pipetask.h"
#include "memcheck.h"
#include "agent_process.h"
#include "index_conf.h"
#include "index_write.h"
#include "snapshot_service.h"
#include "top_index_service.h"
#include "image_service.h"
#include "dtc_tools.h"
#include "comm.h"
#include "version.h"
#include "monitor.h"
#include "index_tbl_op.h"
// Two-level stringification so that macro arguments are expanded before being quoted.
#define STRING_HELPER(str) #str
#define STRING(x) STRING_HELPER(x)
// Release version components, joined below as "MAJOR.MINOR.BUILD".
#define VERSION_MAJOR 1
#define VERSION_MINOR 1
#define VERSION_BUILD 0
#define MAIN_VERSION \
STRING(VERSION_MAJOR) "." \
STRING(VERSION_MINOR) "." \
STRING(VERSION_BUILD)
// Fallback used when the build does not define GIT_VERSION on the command line.
#ifndef GIT_VERSION
#define GIT_VERSION 0000000
#endif
// Full version string written to the log: "<MAIN_VERSION>-<GIT_VERSION>".
#define INDEX_VERSION_STR MAIN_VERSION "-" STRING(GIT_VERSION)
// Set to 1 by sigterm_handler(); polled by the wait loop in main().
volatile int stop = 0;
// Non-zero means daemonize; cleared by DaemonStart() when the config disables it.
int background = 1;
// Thread id recorded by DaemonStart().
pthread_t mainthreadid;
const char progname[] = "index_write";
const char *conf_filename = "../conf/index_write.conf";
int gMaxConnCnt;
// Objects created by Startup_Thread(); only the dispatcher selected by the
// configured service type is instantiated.
static CAgentListenPkg *agentListener;
static CAgentProcess *agentProcess;
static CTaskIndexGen *indexGen;
static CTaskTopIndex *topIndex;
static CTaskSnapShot *snapShot;
static CTaskImage *image;
//single thread version
static CPollThread *workerThread;
/*
 * Create the worker thread, the agent process stage and the dispatcher chosen
 * by the configured service type, then bind the listener and start running.
 * Returns 0 on success and -1 on any failure.
 */
static int Startup_Thread()
{
    indexGen = NULL;
    topIndex = NULL;
    snapShot = NULL;

    workerThread = new CPollThread("worker");
    if (-1 == workerThread->InitializeThread()){
        log_error("InitializeThread error");
        return -1;
    }

    agentProcess = new CAgentProcess(workerThread);
    if(!agentProcess){
        return -1;
    }

    // Exactly one dispatcher is created; unknown service types fall back to
    // the index-generation dispatcher.
    switch(IndexConf::Instance()->GetGlobalConfig().service_type)
    {
    case SERVICE_TOPINDEX:
        topIndex = new CTaskTopIndex(workerThread);
        if(!topIndex)
            return -1;
        if(topIndex->pre_process() != 0)
            return -1;
        agentProcess->BindDispatcher(topIndex);
        break;
    case SERVICE_SNAPSHOT:
        snapShot = new CTaskSnapShot(workerThread);
        if(!snapShot)
            return -1;
        if(snapShot->pre_process() != 0)
            return -1;
        agentProcess->BindDispatcher(snapShot);
        break;
    case SERVICE_PIC:
        image = new CTaskImage(workerThread);
        if(!image)
            return -1;
        if(image->pre_process() != 0)
            return -1;
        agentProcess->BindDispatcher(image);
        break;
    case SERVICE_INDEXGEN:
    default:
        indexGen = new CTaskIndexGen(workerThread);
        if(!indexGen)
            return -1;
        agentProcess->BindDispatcher(indexGen);
        break;
    }

    agentListener = new CAgentListenPkg();
    const int bind_ret = agentListener->Bind(IndexConf::Instance()->GetGlobalConfig().listen_addr.c_str(), agentProcess, 0);
    if(bind_ret < 0){
        log_error("bind addr error");
        return -1;
    }

    workerThread->RunningThread();
    agentListener->Run();
    return 0;
}
int configInit(void)
{
if (!IndexConf::Instance()->ParseConf(conf_filename)) {
cout << "load conf file error " << conf_filename << endl;
return -RT_PARSE_CONF_ERR;
}
SGlobalIndexConfig &globalconfig = IndexConf::Instance()->GetGlobalConfig();
stat_init_log_(progname, globalconfig.logPath.c_str());
stat_set_log_level_(globalconfig.iLogLevel);
log_info("%s v%s: log level %d starting....", progname, INDEX_VERSION_STR, globalconfig.iLogLevel);
return 0;
}
// Signal handler installed by DaemonStart(): requests shutdown by setting the
// global `stop` flag. The signal number is not used.
static void sigterm_handler(int signo)
{
stop = 1;
}
/*
 * Write the current process id into str_pid_file, truncating any previous
 * content. A failure to open the file is logged and otherwise ignored.
 */
void index_create_pid(string str_pid_file) {
    ofstream out(str_pid_file.c_str(), ios::out | ios::trunc);
    if (!out.is_open()) {
        log_error("open pid file error. file:%s, errno:%d, errmsg:%s.",
            str_pid_file.c_str(), errno, strerror(errno));
        return;
    }
    out << getpid();
    out.close();
}
/*
 * Install signal dispositions and, unless the configuration disables
 * background mode, detach from the terminal with daemon().
 * Returns the result of daemon(), or 0 when running in the foreground.
 */
static int DaemonStart()
{
struct sigaction sa;
sigset_t sset;
// The termination-style signals all funnel into sigterm_handler (sets `stop`).
memset(&sa, 0, sizeof(sa));
sa.sa_handler = sigterm_handler;
sigaction(SIGINT, &sa, NULL);
sigaction(SIGTERM, &sa, NULL);
sigaction(SIGQUIT, &sa, NULL);
sigaction(SIGHUP, &sa, NULL);
signal(SIGPIPE, SIG_IGN);
signal(SIGCHLD, SIG_IGN);
// Make sure the signals below are not blocked in this thread's mask.
sigemptyset(&sset);
sigaddset(&sset, SIGTERM);
sigaddset(&sset, SIGSEGV);
sigaddset(&sset, SIGBUS);
sigaddset(&sset, SIGABRT);
sigaddset(&sset, SIGILL);
sigaddset(&sset, SIGCHLD);
sigaddset(&sset, SIGFPE);
// The previous mask is written back into sset and not used afterwards.
sigprocmask(SIG_UNBLOCK, &sset, &sset);
if(!IndexConf::Instance()->GetGlobalConfig().background){
background = 0;
}
// daemon(1, 1): keep the current working directory and the standard streams.
int ret = background ? daemon (1, 1) : 0;
mainthreadid = pthread_self();
return ret;
}
// Shutdown cleanup: destroys the IndexConf and SplitManager singletons, removes
// the pid file and destroys DTCTools. str_pid_file is taken by value, so it
// stays valid after the configuration singleton has been destroyed.
void ServicePostRun(string str_pid_file) {
IndexConf::Instance()->Destroy();
SplitManager::Instance()->Destroy();
unlink(str_pid_file.c_str());
DTCTools::Destroy();
}
int main()
{
CThread *mainThread;
NEW(CThread("main", CThread::ThreadTypeProcess), mainThread);
if(mainThread != NULL) {
mainThread->InitializeThread();
}
if(configInit() != 0){
log_error("config init error");
return -1;
}
if (DaemonStart () < 0){
log_error("DaemonStart error");
return -1;
}
index_create_pid(IndexConf::Instance()->GetGlobalConfig().pid_file);
InitStat(IndexConf::Instance()->GetGlobalConfig().service_name.c_str());
SDTCHost &dtchost = IndexConf::Instance()->GetDTCIndexConfig();
if (g_IndexInstance.InitServer(dtchost) != 0) {
log_error("dtc init error");
return -1;
}
if (g_delIndexInstance.InitServer(dtchost) != 0) {
log_error("dtc init error");
return -1;
}
SDTCHost &indexHost = IndexConf::Instance()->GetDTCIntelligentConfig();
if (g_hanpinIndexInstance.InitServer(indexHost) != 0) {
log_error("dtc init error");
return -1;
}
if (!SplitManager::Instance()->Init(IndexConf::Instance()->GetGlobalConfig())) {
log_error("g_splitManager init error");
return -1;
}
//start statistic thread.
statmgr.StartBackgroundThread();
//ump monitor initialize
common::ProfilerMonitor::GetInstance().Initialize();
DeleteTask::GetInstance().Initialize();
if(Startup_Thread() < 0){
stop = 1;
}
log_info("%s v%s: running...", progname, INDEX_VERSION_STR);
while(!stop){
sleep(10);
}
log_info("%s v%s: stoppping...", progname, INDEX_VERSION_STR);
if(workerThread){
workerThread->interrupt();
}
//DELETE(workerThread);
DELETE(agentListener);
DELETE(indexGen);
DELETE(agentProcess);
ServicePostRun(IndexConf::Instance()->GetGlobalConfig().pid_file);
statmgr.StopBackgroundThread();
log_info("%s v%s: stopped", progname, INDEX_VERSION_STR);
return 0;
}
/* ends here */

View File

@ -0,0 +1,278 @@
/*
* =====================================================================================
*
* Filename: snapshot_service.cc
*
* Description: CTaskSnapShot class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#include <iostream>
#include <string>
#include "log.h"
#include "poll_thread.h"
#include "task_request.h"
#include "dtc_tools.h"
#include "comm.h"
#include "snapshot_service.h"
#include "monitor.h"
#include "chash.h"
// Binds the dispatcher base, the owner-thread pointer and the output stage to
// the poll thread `o`.
CTaskSnapShot::CTaskSnapShot(CPollThread * o) :
CTaskDispatcher<CTaskRequest>(o),
ownerThread(o),
output(o)
{
}
// Nothing to release explicitly; members clean up through their own destructors.
CTaskSnapShot::~CTaskSnapShot()
{
}
/*
 * Extract the three mandatory members of a snapshot request.
 *   subreq <- "table_content" (array), id <- "appid" (int),
 *   count  <- "fields_count" (int)
 * Returns 0 on success, otherwise the RT_NO_* code of the first missing or
 * mistyped member; outputs assigned before the failure keep their new value.
 */
static int decode_request(const Json::Value &req, Json::Value &subreq, int &id, int &count){
    const bool has_content = req.isMember("table_content") && req["table_content"].isArray();
    if(!has_content){
        return RT_NO_TABLE_CONTENT;
    }
    subreq = req["table_content"];

    const bool has_appid = req.isMember("appid") && req["appid"].isInt();
    if(!has_appid){
        return RT_NO_APPID;
    }
    id = req["appid"].asInt();

    const bool has_count = req.isMember("fields_count") && req["fields_count"].isInt();
    if(!has_count){
        return RT_NO_FIELD_COUNT;
    }
    count = req["fields_count"].asInt();
    return 0;
}
/*
 * Decode one table_content item.
 * Returns 0 when there is no string "cmd", RT_CMD_GET for "snapshot",
 * RT_CMD_UPDATE for "update_snapshot" and RT_ERROR_FIELD_CMD for any other
 * command. When the item has an object "fields", doc_id / top (and, for
 * updates only, weight) are copied into `fields` if present with the right type.
 */
static int decode_field(Json::Value table_content,UserTableContent &fields){
    if(!(table_content.isMember("cmd") && table_content["cmd"].isString())){
        return 0;
    }

    string cmd = table_content["cmd"].asString();
    const bool is_get = (cmd == "snapshot");
    const bool is_update = (cmd == "update_snapshot");
    if(!is_get && !is_update){
        return RT_ERROR_FIELD_CMD;
    }

    // Both commands share the doc_id / top parsing; only updates read weight.
    if(table_content.isMember("fields") && table_content["fields"].isObject()){
        Json::Value field = table_content["fields"];
        if(field.isMember("doc_id") && field["doc_id"].isString()){
            fields.doc_id = field["doc_id"].asString();
        }
        if (field.isMember("top") && field["top"].isInt()) {
            fields.top = field["top"].asInt();
        }
        if(is_update && field.isMember("weight") && field["weight"].isInt()){
            fields.weight = field["weight"].asInt();
        }
    }

    if(is_get){
        return RT_CMD_GET;
    }
    return RT_CMD_UPDATE;
}
/*
 * Issue a DTC get for the snapshot row of fields.doc_id, requesting the
 * "extend" and "created_time" columns. The key tag is "11" when fields.top is 1
 * and "10" otherwise. Returns the result of Execute(); the return values of
 * SetKey()/Need() are not inspected.
 */
static int get_snapshot_execute(DTC::Server* dtc_server,const UserTableContent &fields,DTC::Result &rst){
    string top_tag = "10";
    if (1 == fields.top) {
        top_tag = "11";
    }

    DTC::GetRequest getReq(dtc_server);
    getReq.SetKey(gen_dtc_key_string(fields.appid, top_tag, fields.doc_id).c_str());
    getReq.Need("extend");
    getReq.Need("created_time");
    return getReq.Execute(rst);
}
/*
 * Fetch the snapshot of fields.doc_id from DTC and copy it into `fields`
 * (content <- "extend", publish_time <- "created_time"; title and author are
 * set to empty strings).
 * Returns 0 on success, RT_NO_THIS_DOC when no row exists and
 * RT_ERROR_GET_SNAPSHOT on connection or request failure. A request that fails
 * with -110 is retried once.
 */
int CTaskSnapShot::get_snapshot_dtc(UserTableContent &fields,Json::Value &res){
    DTC::Server* dtc_server = index_servers.GetServer();
    if(NULL == dtc_server){
        log_error("snapshot server connect error!");
        return RT_ERROR_GET_SNAPSHOT;
    }

    DTC::Result rst;
    int ret = get_snapshot_execute(dtc_server,fields,rst);
    if (-110 == ret) {
        // Single retry for error -110.
        rst.Reset();
        ret = get_snapshot_execute(dtc_server,fields,rst);
    }
    if (0 != ret) {
        log_error("get request error! errcode %d,errmsg %s, errfrom %s", ret, rst.ErrorMessage(), rst.ErrorFrom());
        res[MESSAGE] = rst.ErrorMessage();
        return RT_ERROR_GET_SNAPSHOT;
    }

    const int cnt = rst.NumRows();
    if (cnt <= 0) {
        res["doc_id"] = fields.doc_id;
        res[MESSAGE] = "no this doc";
        return RT_NO_THIS_DOC;
    }

    for (int row = 0; row < cnt; ++row) {
        rst.FetchRow();
        fields.title = "";
        fields.content = rst.StringValue("extend");
        fields.publish_time = rst.IntValue("created_time");
        fields.author = "";
        // title was just cleared, so this early exit never triggers and the
        // values of the last fetched row are the ones that remain.
        if(fields.title.length() > 0 && fields.content.length() > 0)
            break;
    }
    return 0;
}
int CTaskSnapShot::pre_process(void){
DTCTools *dtc_tools = DTCTools::Instance();
dtc_tools->init_servers(index_servers, IndexConf::Instance()->GetDTCIndexConfig());
return 0;
}
/*
 * Update the "weight" column of the snapshot row of fields.doc_id.
 * The key tag is "11" when fields.top is 1 and "10" otherwise.
 * Returns 0 on success, RT_ERROR_GET_SNAPSHOT when no server is available and
 * RT_ERROR_UPDATE_SNAPSHOT when the update request fails.
 */
int CTaskSnapShot::update_sanpshot_dtc(const UserTableContent &fields,Json::Value &res){
    DTC::Server* dtc_server = index_servers.GetServer();
    if(NULL == dtc_server){
        log_error("snapshot server connect error!");
        return RT_ERROR_GET_SNAPSHOT;
    }

    string top_tag = "10";
    if (1 == fields.top) {
        top_tag = "11";
    }

    DTC::UpdateRequest updateReq(dtc_server);
    updateReq.SetKey(gen_dtc_key_string(fields.appid, top_tag, fields.doc_id).c_str());
    updateReq.Set("weight",fields.weight);

    DTC::Result rst;
    const int ret = updateReq.Execute(rst);
    if (0 != ret)
    {
        log_error("updateReq error! ,errno %d ,errmsg %s, errfrom %s\n", ret,rst.ErrorMessage(), rst.ErrorFrom());
        return RT_ERROR_UPDATE_SNAPSHOT;
    }
    return ret;
}
/*
 * Handle one snapshot request: exactly one table_content item, which either
 * reads a snapshot (result copied into `res`) or updates its weight.
 * Returns 0 on success or an error code. When the item carries no doc_id the
 * code returned by decode_field() is passed back unchanged.
 */
int CTaskSnapShot::snapshot_process(Json::Value &req,Json::Value &res){
    int app_id;
    int fields_count = 0;
    Json::Value table_content;

    int ret = decode_request(req, table_content, app_id,fields_count);
    if(0 != ret){
        return ret;
    }

    const bool single_item = (fields_count == 1) && (fields_count == (int)table_content.size());
    if(!single_item){
        res["message"] = "fields_count and table size must be 1";
        return RT_ERROR_FIELD_COUNT;
    }

    UserTableContent content_fields(app_id);
    ret = decode_field(table_content[0],content_fields);

    // Neither command is executed without a document id.
    if(content_fields.doc_id.length() == 0){
        return ret;
    }

    if(RT_CMD_GET == ret){
        ret = get_snapshot_dtc(content_fields,res);
        if(0 == ret){
            res["doc_id"] = content_fields.doc_id;
            res["title"] = content_fields.title;
            res["content"] = content_fields.content;
            res["author"] = content_fields.author;
            res["publish_time"] = content_fields.publish_time;
        }
    }else if(RT_CMD_UPDATE == ret){
        ret = update_sanpshot_dtc(content_fields,res);
    }
    return ret;
}
/*
 * Entry point for one incoming task: validates the service type, parses the
 * JSON payload, runs snapshot_process() and replies with a JSON document whose
 * "code" member is 0 on success or the error code otherwise.
 */
void CTaskSnapShot::TaskNotify(CTaskRequest * curr)
{
    log_debug("CTaskSnapShot::TaskNotify start");
    common::CallerInfo caller_info = common::ProfilerMonitor::GetInstance().RegisterInfo(std::string("searchEngine.searchService.snapshotTask"));

    // Nothing to reply to: just record the failure with the profiler.
    if(NULL == curr){
        common::ProfilerMonitor::GetInstance().FunctionError(caller_info);
        common::ProfilerMonitor::GetInstance().RegisterInfoEnd(caller_info);
        return;
    }

    //there is a race condition here:
    //curr may be deleted during process (in task->ReplyNotify())
    CTaskRequest * task = curr;
    Json::Value res;
    res["code"] = 0;

    if(SERVICE_SNAPSHOT != task->GetReqCmd()){
        res["code"] = RT_ERROR_SERVICE_TYPE;
        res["reqcmd"] = task->GetReqCmd();
        res["message"] = "service type wrong! need 108";
    }else{
        std::string req = task->buildRequsetString();
        log_debug("recv:%s\n",req.c_str());

        Json::Reader reader;
        Json::Value value;
        if(!reader.parse(req,value,false)){
            log_error("parse json error!\ndata:%s errors:%s\n",req.c_str(),reader.getFormattedErrorMessages().c_str());
            res["code"] = RT_PARSE_JSON_ERR;
            res["message"] = reader.getFormattedErrorMessages();
            res["data"] = req;
        }else if(!value.isObject()){
            // Valid JSON, but the top level is not an object.
            log_error("parse json error!\ndata:%s errors:%s\n",req.c_str(),reader.getFormattedErrorMessages().c_str());
            res["code"] = RT_PARSE_JSON_ERR;
            res["message"] = "it's not a json";
            res["data"] = req;
        }else{
            int ret = snapshot_process(value,res);
            if(0 != ret){
                res["code"] = ret;
            }
        }
    }

    // Common reply path for both success and failure.
    Json::FastWriter writer;
    task->setResult(writer.write(res));
    task->ReplyNotify();
    common::ProfilerMonitor::GetInstance().RegisterInfoEnd(caller_info);
    return;
}

View File

@ -0,0 +1,53 @@
/*
* =====================================================================================
*
* Filename: snapshot_service.h
*
* Description: CTaskSnapShot class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#ifndef SNAPSHOT_SERVICE_H_
#define SNAPSHOT_SERVICE_H_
#include "request_base.h"
#include "index_conf.h"
#include "dtcapi.h"
#include "split_manager.h"
using namespace std;
/*
 * Task dispatcher for snapshot requests: reads a document snapshot from DTC or
 * updates its weight.
 */
class CTaskSnapShot : public CTaskDispatcher<CTaskRequest>
{
private:
CPollThread * ownerThread;
CRequestOutput<CTaskRequest> output;
// DTC server pool, initialised by pre_process().
DTC::DTCServers index_servers;
private:
// Loads the snapshot of fields.doc_id into `fields`; error details go to `res`.
int get_snapshot_dtc(UserTableContent &fields,Json::Value &res);
// Writes fields.weight to the snapshot row of fields.doc_id.
int update_sanpshot_dtc(const UserTableContent &fields,Json::Value &res);
public:
CTaskSnapShot(CPollThread * o);
virtual ~CTaskSnapShot();
// Must run before requests are served: sets up index_servers.
int pre_process(void);
// Applies a parsed request; returns 0 on success or an error code.
int snapshot_process(Json::Value &req,Json::Value &res);
// Forwards the downstream dispatcher to the output stage.
inline void BindDispatcher(CTaskDispatcher<CTaskRequest> *p)
{
output.BindDispatcher(p);
}
// Called for every incoming task; always sends a JSON reply for a non-NULL task.
virtual void TaskNotify(CTaskRequest * curr);
};
#endif /* SNAPSHOT_SERVICE_H_ */

View File

@ -0,0 +1,335 @@
/*
* =====================================================================================
*
* Filename: split_manager.cc
*
* Description: SplitManager class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#include "split_manager.h"
#include "log.h"
#include "stem.h"
#include "comm.h"
#include <string>
#include <sstream>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <math.h>
#include <sys/time.h>
using namespace std;
// (word, count) pair used when ranking words by count.
typedef pair<string, int> PAIR;
// Orders PAIRs by descending count (strict weak ordering on `second`).
struct CmpByValue {
    // const so the comparator can be invoked through const objects/references.
    bool operator()(const PAIR& lhs, const PAIR& rhs) const {
        return lhs.second > rhs.second;
    }
};
// Starts with an empty stop-word set; dictionaries are loaded later by Init().
SplitManager::SplitManager() {
stop_word_set.clear();
}
// Nothing to release explicitly; members clean up through their own destructors.
SplitManager::~SplitManager() {
}
// Converts a C string to an integer with atoi(); a NULL pointer yields 0.
static int32_t ToInt(const char* str) {
    return (NULL == str) ? 0 : atoi(str);
}
// Converts a C string to a string object; a NULL pointer yields an empty string.
static string ToString(const char* str) {
    if (str != NULL) {
        return string(str);
    }
    return string();
}
/*
 * Load the per-application field definitions from
 * ../conf/app_field_define.txt into tableDefine[appid][field_name].
 * Each line is tab-separated; lines with fewer than 11 columns are ignored.
 * Returns false only when the file cannot be opened.
 * NOTE(review): despite the name, the data comes from a text file here.
 */
bool SplitManager::fetch_tbinfo_from_mysql_to_map(){
ifstream app_filed_infile;
app_filed_infile.open("../conf/app_field_define.txt");
if (app_filed_infile.is_open() == false) {
log_error("open file error: ../conf/app_field_define.txt");
return false;
}
string str;
while (getline(app_filed_infile, str))
{
vector<string> str_vec = splitEx(str, "\t");
if (str_vec.size() >= 11) {
struct table_info tbinfo;
// Column 0 is skipped; columns are consumed in order starting at index 1.
int32_t row_index = 1;
uint32_t appid = ToInt(str_vec[row_index++].c_str());
string field_name = ToString(str_vec[row_index++].c_str());
tbinfo.is_primary_key = ToInt(str_vec[row_index++].c_str());
tbinfo.field_type = ToInt(str_vec[row_index++].c_str());
tbinfo.index_tag = ToInt(str_vec[row_index++].c_str());
tbinfo.snapshot_tag = ToInt(str_vec[row_index++].c_str());
tbinfo.segment_tag = ToInt(str_vec[row_index++].c_str());
tbinfo.field_value = ToInt(str_vec[row_index++].c_str());
// Column 9 is not used.
row_index++;
tbinfo.segment_feature = ToInt(str_vec[row_index++].c_str());
if (str_vec.size() >= 12){
// Optional column 11: the union key, formatted like "27,1,26", where each
// number is the value of a corresponding field.
tbinfo.index_info = ToString(str_vec[row_index].c_str());
log_debug("union key[%s]", tbinfo.index_info.c_str());
}
log_debug("appid: %d, field_name: %s", appid, field_name.c_str());
tableDefine[appid][field_name] = tbinfo;
}
}
log_debug("tableDefine size: %d", (int)tableDefine.size());
app_filed_infile.close();
return true;
}
bool SplitManager::Init(const SGlobalIndexConfig &global_cfg) {
bool ret = seg.Init3(global_cfg.trainingPath,global_cfg.sWordsPath);
if (ret == false) {
log_error("seg init error.");
return false;
}
ifstream inf;
string s;
string word;
split_mode = global_cfg.sSplitMode;
//load stop words
inf.open(global_cfg.stopWordsPath.c_str());
if (inf.is_open() == false) {
printf("open file error: %s.\n", "./stop_words.dict");
return false;
}
while (getline(inf, s)) {
stop_word_set.insert(s);
}
inf.close();
log_info("load %d words from stop_words.dict",(int)stop_word_set.size());
string str;
ifstream phonetic_infile;
uint32_t phonetic_id = 0;
uint32_t character_id = 0;
string phonetic;
string charact;
phonetic_infile.open(global_cfg.sPhoneticPath.c_str());
if (phonetic_infile.is_open() == false) {
log_error("open file error: %s.", global_cfg.sPhoneticPath.c_str());
return false;
}
while (getline(phonetic_infile, str))
{
vector<string> str_vec = splitEx(str, "\t");
if (str_vec.size() == 2) {
phonetic_id = atoi(str_vec[0].c_str());
phonetic = str_vec[1];
phonetic_map[phonetic] = phonetic_id;
}
}
phonetic_infile.close();
ifstream phonetic_base_infile;
phonetic_base_infile.open(global_cfg.sPhoneticBasePath.c_str());
if (phonetic_base_infile.is_open() == false) {
log_error("open file error: %s.", global_cfg.sPhoneticBasePath.c_str());
return false;
}
while (getline(phonetic_base_infile, str))
{
vector<string> str_vec = splitEx(str, "\t");
if (str_vec.size() == 2) {
charact = str_vec[0];
phonetic = str_vec[1];
charact_phonetic_map.insert(make_pair(charact, phonetic));
}
}
phonetic_base_infile.close();
ifstream character_infile;
character_infile.open(global_cfg.sCharacterPath.c_str());
if (character_infile.is_open() == false) {
log_error("open file error: %s.", global_cfg.sCharacterPath.c_str());
return false;
}
while (getline(character_infile, str))
{
vector<string> str_vec = splitEx(str, "\t");
if (str_vec.size() == 2) {
character_id = atoi(str_vec[0].c_str());
charact = str_vec[1];
charact_map[charact] = character_id;
}
}
character_infile.close();
log_info("load %d words from phonetic_map, %d words from charact_map", (int)phonetic_map.size(), (int)charact_map.size());
return fetch_tbinfo_from_mysql_to_map();
}
/*
 * Decide whether `word` may be indexed and, if so, resolve its id.
 * Returns false for stop words and for words that are neither known to the
 * dictionaries nor parseable as a non-zero number; otherwise returns true and
 * stores the id in `id`.
 */
bool SplitManager::wordValid(string word, uint32_t appid, uint32_t &id) {
if(stop_word_set.find(word) != stop_word_set.end()){
log_debug("word:%s invalid,in the stop.dict",word.c_str());
return false;
}
uint64_t int_word;
string output_word = word;
WordInfo wordinfo;
// Words starting with an ASCII letter are stemmed and looked up through the segmenter first.
if((word[0]>='a' && word[0]<='z')||(word[0]>='A' && word[0]<='Z')){
output_word = stem(word);
seg.GetWordInfo(output_word, appid, wordinfo);//bug fixed ,English need to call dtc to search the wordinfo first
}
// Still no id (presumably word_id defaults to 0 - TODO confirm): try the dictionary-only lookup.
if(0 == wordinfo.word_id && !GetWordInfo(output_word,appid,wordinfo)){
// Unknown word: accept it only if it parses as a non-zero decimal number.
int_word = strtoull(word.c_str(),NULL,10);
if(int_word != 0){
// Drop trailing digits until the value is at most MAXNUMBER, then map it
// into the id range that starts at NUMBER_ID.
while(int_word > MAXNUMBER){
int_word = int_word/10;
}
id = NUMBER_ID + int_word;
return true;
}
log_debug("word:%s invalid,not in the wordbase",output_word.c_str());
return false;
}else{
id = wordinfo.word_id;
}
return true;
}
// Looks up the id of a character in charact_map; `id` is 0 when the character
// is unknown. Always returns true.
bool SplitManager::GetCharactId(string charact, uint32_t &id) {
    map<string, uint32_t>::iterator pos = charact_map.find(charact);
    if (pos == charact_map.end()) {
        id = 0;
    } else {
        id = pos->second;
    }
    return true;
}
// Looks up the id of a phonetic string in phonetic_map; `id` is 0 when it is
// unknown. Always returns true.
bool SplitManager::GetPhoneticId(string phonetic, uint32_t &id) {
    map<string, uint32_t>::iterator pos = phonetic_map.find(phonetic);
    if (pos == phonetic_map.end()) {
        id = 0;
    } else {
        id = pos->second;
    }
    return true;
}
/*
 * Return every phonetic string recorded for `charact` in charact_phonetic_map;
 * the result is empty when the character is unknown.
 */
vector<string> SplitManager::GetPhonetic(string charact) {
    typedef multimap<string, string>::iterator PhoneticIter;
    vector<string> vec;
    // equal_range yields exactly the run of entries for this key, in one
    // lookup; find() does not promise to return the first equivalent entry.
    pair<PhoneticIter, PhoneticIter> range = charact_phonetic_map.equal_range(charact);
    for (PhoneticIter iter = range.first; iter != range.second; ++iter) {
        vec.push_back(iter->second);
    }
    return vec;
}
struct table_info *SplitManager::get_table_info(uint32_t appid, string field_name){
if(tableDefine.find(appid) != tableDefine.end()){
if(tableDefine[appid].find(field_name) != tableDefine[appid].end()){
return &(tableDefine[appid][field_name]);
}
}
return NULL;
}
bool SplitManager::getHanpinField(uint32_t appid, vector<uint32_t> & field_vec) {
if (tableDefine.find(appid) != tableDefine.end()) {
map<string, table_info> stMap = tableDefine[appid];
map<string, table_info>::iterator iter = stMap.begin();
for (; iter != stMap.end(); iter++) {
table_info tInfo = iter->second;
if (tInfo.segment_tag == 3 || tInfo.segment_tag == 4) {
field_vec.push_back(tInfo.field_value);
}
}
return true;
}
return false;
}
bool SplitManager::getUnionKeyField(uint32_t appid, vector<string> & field_vec){
if (tableDefine.find(appid) != tableDefine.end()) {
map<string, table_info> stMap = tableDefine[appid];
map<string, table_info>::iterator iter = stMap.begin();
for (; iter != stMap.end(); iter++) {
table_info tInfo = iter->second;
if(tInfo.field_type == FIELD_INDEX){
field_vec.push_back(tInfo.index_info);
}
}
return true;
}
return false;
}
// Returns true only when `appid` has field definitions loaded in tableDefine.
// Previously both paths returned true, so callers could never reject an
// unknown application id.
bool SplitManager::is_effective_appid(uint32_t appid){
    return tableDefine.find(appid) != tableDefine.end();
}
// Drops the loaded stop words; the other dictionaries are left untouched.
void SplitManager::DeInit() {
stop_word_set.clear();
}
/*
 * Segment `str` for application `appid` with the configured split mode and
 * return the segmenter's result. The elapsed time is written to the debug log.
 */
vector<vector<string> > SplitManager::split(string str,uint32_t appid) {
    iutf8string utf8_input(str);
    vector<vector<string> > res_all;

    struct timeval begin_tv;
    gettimeofday(&begin_tv, NULL);
    unsigned int begin_us = begin_tv.tv_sec * 1000000 + begin_tv.tv_usec;

    seg.cut_for_search(utf8_input,appid,res_all,split_mode);

    struct timeval end_tv;
    gettimeofday(&end_tv, NULL);
    unsigned int end_us = end_tv.tv_sec * 1000000 + end_tv.tv_usec;

    // Microsecond difference converted to milliseconds for the log.
    log_debug("split time:%u ms",(end_us-begin_us)/1000);
    return res_all;
}
// N-gram split of `str`: delegates to the segmenter's cut_ngram(), passing the
// UTF-8 length of the input as its last argument.
vector<string> SplitManager::split(string str) {
    iutf8string utf8_input(str);
    vector<string> grams;
    seg.cut_ngram(utf8_input, grams, utf8_input.length());
    return grams;
}
// Strips leading and trailing space characters from `str` in place and returns
// a copy of the trimmed string. Only ' ' is removed, not other whitespace.
string trim(string& str)
{
    // Remove leading spaces (erases everything when the string is all spaces).
    const string::size_type first_kept = str.find_first_not_of(" ");
    str.erase(0, first_kept);
    // Remove trailing spaces.
    const string::size_type last_kept = str.find_last_not_of(" ");
    str.erase(last_kept + 1);
    return str;
}
/*
 * Return the text enclosed between the first "((" and the last "))" of `str`,
 * e.g. "POLYGON((1 2,3 4))" -> "1 2,3 4". When the string does not contain
 * such a pair (or the closing pair does not come after the opening pair) the
 * input is returned unchanged. `str` itself is never modified.
 */
string delPrefix(string& str){
    // Search for the two-character delimiters themselves; the character-set
    // searches used before matched any single parenthesis and could compute an
    // out-of-range or wrapped-around substring for inputs like ")(" or "(x)".
    const string::size_type open_pos = str.find("((");
    const string::size_type close_pos = str.rfind("))");
    if(open_pos == string::npos || close_pos == string::npos){
        return str;
    }
    const string::size_type begin = open_pos + 2;
    if(close_pos < begin){
        return str;
    }
    return str.substr(begin, close_pos - begin);
}

View File

@ -0,0 +1,87 @@
/*
* =====================================================================================
*
* Filename: split_manager.h
*
* Description: SplitManager class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#ifndef __SPLIT_MANAGER_H__
#define __SPLIT_MANAGER_H__
#include <string>
#include <sys/types.h>
#include <vector>
#include "index_conf.h"
#include "split_tool.h"
using namespace std;
// Base of the id range assigned to purely numeric words by SplitManager::wordValid().
#define NUMBER_ID 500000000
// Numeric words are reduced (divided by 10) until they do not exceed this value.
#define MAXNUMBER 100000000
// Definition of one field of an application table, as loaded from the field
// definition file. Members have no default values.
struct table_info{
int is_primary_key;
int field_type;
int index_tag;
int snapshot_tag;
// Numeric value identifying the field; this is what getHanpinField() reports.
int field_value;
// Segmentation mode of the field; getHanpinField() selects values 3 and 4.
int segment_tag;
int segment_feature;
// Union key description, e.g. "27,1,26"; reported by getUnionKeyField().
string index_info;
};
/*
 * Singleton that owns the word segmenter and the dictionaries loaded at
 * start-up (stop words, character/phonetic ids, per-application field
 * definitions).
 */
class SplitManager {
public:
SplitManager();
~SplitManager();
// Singleton access and teardown through CSingleton.
static SplitManager *Instance(){
return CSingleton<SplitManager>::Instance();
}
static void Destroy(){
CSingleton<SplitManager>::Destroy();
}
// Loads the segmenter and all dictionaries; returns false on the first failure.
bool Init(const SGlobalIndexConfig &global_cfg);
void DeInit();
// Segmentation for search, using the configured split mode.
vector<vector<string> > split(string str,uint32_t appid);
// N-gram split of the whole string.
vector<string> split(string str);
// Returns true and sets `id` when `word` may be indexed.
bool wordValid(string word, uint32_t appid, uint32_t &id);
// Dictionary-only lookup of a word.
bool GetWordInfo(string word, uint32_t appid, WordInfo &word_info) {
return seg.GetWordInfoFromDictOnly(word, appid, word_info);
}
// Returns NULL when the application or the field is unknown.
struct table_info *get_table_info(uint32_t appid, string filed_name);
bool is_effective_appid(uint32_t appid);
// Both id lookups set `id` to 0 for unknown input and always return true.
bool GetCharactId(string charact, uint32_t &id);
bool GetPhoneticId(string phonetic, uint32_t &id);
// All phonetic strings recorded for a character.
vector<string> GetPhonetic(string charact);
// Both append to field_vec and return false when the application is unknown.
bool getHanpinField(uint32_t appid, vector<uint32_t> & field_vec);
bool getUnionKeyField(uint32_t appid, vector<string> & field_vec);
private:
// Loads tableDefine from the field definition file.
bool fetch_tbinfo_from_mysql_to_map();
FBSegment seg;
set<string> stop_word_set;
map<string, u_int32_t> word_map;
// appid -> (field name -> field definition)
map<uint32_t,map<string,table_info> > tableDefine;
string split_mode;
map<string, uint32_t> charact_map;
map<string, uint32_t> phonetic_map;
// character -> phonetic; one character may map to several phonetics.
multimap<string, string> charact_phonetic_map;
};
// Strips leading/trailing spaces from `str` in place and returns the result.
string trim(string& str);
// Returns the text between "((" and "))" in `str`, or `str` itself when there is none.
string delPrefix(string& str);
#endif

View File

@ -0,0 +1,12 @@
#!/bin/sh
# Start index_write from the directory that contains this script.
# Expansions are quoted so that paths containing spaces work, and the script
# stops if it cannot enter its own directory.
cd "$(dirname "$0")" || exit 1
proname="index_write"
if [ -f "$proname" ] ; then
    chmod 755 "$proname"
    "./$proname"
else
    echo "no program"
fi

View File

@ -0,0 +1,14 @@
#!/bin/sh
# Stop index_write: send the default signal of kill to the pid recorded in
# index_write.pid (next to this script) and remove the pid file.
root=$(cd "$(dirname "$0")" && pwd)
pid_file="$root/index_write.pid"
if [ -f "$pid_file" ] ; then
    pid=$(cat "$pid_file")
    # An empty pid file would otherwise make kill fail with a usage error.
    if [ -n "$pid" ] ; then
        kill "$pid"
    fi
    /bin/rm -f "$pid_file"
else
    echo "No pid file."
fi

View File

@ -0,0 +1,466 @@
/*
* =====================================================================================
*
* Filename: top_index_service.cc
*
* Description: CTaskTopIndex class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#include "top_index_service.h"
#include <iostream>
#include <string>
#include <map>
#include "log.h"
#include "poll_thread.h"
#include "task_request.h"
#include "dtc_tools.h"
#include "comm.h"
#include "index_clipping.h"
#include "monitor.h"
#include "chash.h"
// Creates the top-index task dispatcher for poll thread `o`.
// The same thread pointer is passed to the CTaskDispatcher base class and to
// the `output` member, and is also remembered in `ownerThread`.
CTaskTopIndex::CTaskTopIndex(CPollThread * o)
    : CTaskDispatcher<CTaskRequest>(o)
    , ownerThread(o)
    , output(o)
{
}
// Nothing to release explicitly; members clean up through their own destructors.
CTaskTopIndex::~CTaskTopIndex() {}
int CTaskTopIndex::insert_snapshot_dtc(const UserTableContent &fields,int &doc_version,Json::Value &res){
int ret;
DTC::Server* dtc_server = index_servers.GetServer();
if(NULL == dtc_server){
log_error("snapshot server connect error!");
return RT_ERROR_GET_SNAPSHOT;
}
DTC::InsertRequest insertReq(dtc_server);
insertReq.SetKey(gen_dtc_key_string(fields.appid, "11", fields.doc_id).c_str());
insertReq.Set("doc_id", fields.doc_id.c_str());
insertReq.Set("doc_version", doc_version);
insertReq.Set("created_time", fields.publish_time);
insertReq.Set("field", fields.top);
insertReq.Set("word_freq", 0);
insertReq.Set("weight", 0);
insertReq.Set("location", "");
insertReq.Set("start_time", 0);
insertReq.Set("end_time", 0);
insertReq.Set("extend", fields.content.c_str());
DTC::Result rst;
ret = insertReq.Execute(rst);
if (ret != 0)
{
log_error("insert request error! ,errno %d ,errmsg %s, errfrom %s\n", ret,rst.ErrorMessage(), rst.ErrorFrom());
return -1;
}
return 0;
}
int CTaskTopIndex::delete_snapshot_dtc(const string &doc_id, uint32_t appid, Json::Value &res) {
int ret;
DTC::Server* dtc_server = index_servers.GetServer();
if (NULL == dtc_server) {
log_error("snapshot server connect error!");
return RT_ERROR_GET_SNAPSHOT;
}
DTC::DeleteRequest deleteReq(dtc_server);
ret = deleteReq.SetKey(gen_dtc_key_string(appid, "11", doc_id).c_str());
ret = deleteReq.EQ("doc_id", doc_id.c_str());
DTC::Result rst;
ret = deleteReq.Execute(rst);
if (ret != 0)
{
log_error("delete request error! ,errno %d ,errmsg %s, errfrom %s\n", ret, rst.ErrorMessage(), rst.ErrorFrom());
return RT_ERROR_DELETE_SNAPSHOT;
}
return 0;
}
// Runs a single DTC get for the snapshot row of `fields` (key built from
// appid, "11" and doc_id), asking only for the "doc_version" column.
// The outcome is placed in `rst`; the return value is whatever Execute()
// reports.
static int get_snapshot_execute(DTC::Server* dtc_server,const UserTableContent &fields,DTC::Result &rst){
    DTC::GetRequest request(dtc_server);
    const string row_key = gen_dtc_key_string(fields.appid, "11", fields.doc_id);
    request.SetKey(row_key.c_str());
    request.Need("doc_version");
    return request.Execute(rst);
}
// Looks up the stored doc_version of the snapshot row for `fields`.
//
// The get request is issued once and retried exactly one time when it fails
// with the retryable code below; any other failure, or a failed retry, is
// logged and reported as RT_ERROR_GET_SNAPSHOT.
//
// On success `doc_version` receives the "doc_version" of the last row in the
// result set and 0 is returned. RT_NO_THIS_DOC is returned when the result
// set is empty (`doc_version` is left untouched in that case).
// `res` is not modified.
int CTaskTopIndex::get_snapshot_active_doc(const UserTableContent &fields,int &doc_version,Json::Value &res){
    // Return code that earns the request one retry.
    // NOTE(review): -110 looks like -ETIMEDOUT on Linux; confirm against the DTC client.
    const int kRetryOnceCode = -110;

    DTC::Server* dtc_server = index_servers.GetServer();
    if(NULL == dtc_server){
        log_error("snapshot server connect error!");
        return RT_ERROR_GET_SNAPSHOT;
    }

    DTC::Result rst;
    int ret = get_snapshot_execute(dtc_server,fields,rst);
    if(ret == kRetryOnceCode){
        rst.Reset();
        ret = get_snapshot_execute(dtc_server,fields,rst);
    }
    if(ret != 0){
        log_error("get request error! errcode %d,errmsg %s, errfrom %s", ret, rst.ErrorMessage(), rst.ErrorFrom());
        return RT_ERROR_GET_SNAPSHOT;
    }

    const int cnt = rst.NumRows();
    if(cnt <= 0){
        return RT_NO_THIS_DOC;
    }
    // Walk every returned row; the value of the last one wins.
    for(int i = 0; i < cnt; i++){
        rst.FetchRow();
        doc_version = rst.IntValue("doc_version");
    }
    return 0;
}
// Extracts the three mandatory top-level members of a top-index request.
//
//   "table_content" (array) -> subreq
//   "appid"         (int)   -> id
//   "fields_count"  (int)   -> count
//
// Members are checked in that order and each output is written as soon as
// its member validates. Returns 0 when all three are present with the
// expected type, otherwise RT_NO_TABLE_CONTENT, RT_NO_APPID or
// RT_NO_FIELD_COUNT for the first one that is missing or mistyped.
static int decode_request(const Json::Value &req, Json::Value &subreq, int &id, int &count){
    const bool has_content = req.isMember("table_content") && req["table_content"].isArray();
    if(!has_content){
        return RT_NO_TABLE_CONTENT;
    }
    subreq = req["table_content"];

    const bool has_appid = req.isMember("appid") && req["appid"].isInt();
    if(!has_appid){
        return RT_NO_APPID;
    }
    id = req["appid"].asInt();

    const bool has_count = req.isMember("fields_count") && req["fields_count"].isInt();
    if(!has_count){
        return RT_NO_FIELD_COUNT;
    }
    count = req["fields_count"].asInt();

    return 0;
}
// Decodes one entry of "table_content" into `fields`.
//
// Recognised "cmd" values:
//   "top_add" / "top_update": `json_fields` receives the entry's "fields"
//       object and `fields` is filled from it (doc id from "id" or, failing
//       that, "doc_id"; optional "sp_words", "weight", "publish_time",
//       "top_start_time", "top_end_time" with defaults). Returns RT_CMD_ADD.
//   "top_delete": the doc id is read from fields.doc_id of the entry.
//       Returns RT_CMD_DELETE.
//
// Other return values:
//   RT_NO_DOCID         add/update without a string "id" or "doc_id"
//   RT_ERROR_FIELD_CMD  "cmd" is a string but not one of the above
//   0                   nothing to do: the entry is not an object, has no
//                       string "cmd", or lacks the "fields" data it needs
static int decode_field(Json::Value table_content,Json::Value &json_fields,UserTableContent &fields){
    // isMember() is only valid on object (or null) values, so an entry that
    // is e.g. a number or a string must be rejected before it is probed;
    // it is handled like an entry without a "cmd".
    if(!table_content.isObject()){
        return 0;
    }
    if(!(table_content.isMember("cmd") && table_content["cmd"].isString())){
        return 0;
    }
    const string cmd = table_content["cmd"].asString();

    if(cmd == "top_delete"){
        fields.top = 1;
        Json::Value field = table_content["fields"];
        // Same precondition as above: "fields" may be any JSON type here.
        if(field.isObject() && field.isMember("doc_id") && field["doc_id"].isString()){
            fields.doc_id = field["doc_id"].asString();
            return RT_CMD_DELETE;
        }
        return 0;
    }

    if(cmd != "top_add" && cmd != "top_update"){
        return RT_ERROR_FIELD_CMD;
    }

    // ---- top_add / top_update ----
    fields.top = 1;
    if(!(table_content.isMember("fields") && table_content["fields"].isObject())){
        return 0;
    }
    json_fields = table_content["fields"];
    const time_t now = time(NULL);

    // Document id: "id" takes precedence over "doc_id".
    if(json_fields.isMember("id") && json_fields["id"].isString()){
        fields.doc_id = json_fields["id"].asString();
    }else if(json_fields.isMember("doc_id") && json_fields["doc_id"].isString()){
        fields.doc_id = json_fields["doc_id"].asString();
    }else{
        return RT_NO_DOCID;
    }

    if(json_fields.isMember("sp_words") && json_fields["sp_words"].isString()){
        fields.sp_words = json_fields["sp_words"].asString();
        fields.description = json_fields["sp_words"].asString();//description is used as the sp_words section
    }

    if(json_fields.isMember("weight") && json_fields["weight"].isInt()){
        fields.weight = json_fields["weight"].asInt();
    }else{
        fields.weight = 1;
    }

    if(json_fields.isMember("publish_time") && json_fields["publish_time"].isInt()){
        fields.publish_time = json_fields["publish_time"].asInt();
    }else{
        fields.publish_time = now;
    }

    if(json_fields.isMember("top_start_time") && json_fields["top_start_time"].isInt()){
        fields.top_start_time = json_fields["top_start_time"].asInt();
    }else{
        fields.top_start_time = now;
    }

    if(json_fields.isMember("top_end_time") && json_fields["top_end_time"].isInt()){
        // An end time earlier than the start time is clamped to the start
        // time. The previous code computed the clamp and then overwrote it
        // unconditionally, so it never took effect.
        const int requested_end = json_fields["top_end_time"].asInt();
        if(requested_end < fields.top_start_time){
            fields.top_end_time = fields.top_start_time;
        }else{
            fields.top_end_time = requested_end;
        }
    }else{
        // Default window: one day after the start time.
        fields.top_end_time = fields.top_start_time + (24*60*60);
    }
    return RT_CMD_ADD;
}
int CTaskTopIndex::pre_process(void){
DTCTools *dtc_tools = DTCTools::Instance();
dtc_tools->init_servers(index_servers, IndexConf::Instance()->GetDTCIndexConfig());
return 0;
}
// Splits `str` on '|' and collects the pieces into `word_set`.
//
// Every piece must be accepted by SplitManager::wordValid() for `appid`;
// the first rejected piece is logged and aborts the call with
// RT_ERROR_INVALID_SP_WORD (pieces accepted before it stay in `word_set`).
// Returns 0 when all pieces were accepted. `doc_id` and `res` are not used.
int CTaskTopIndex::do_split_sp_words(string &str, string &doc_id, uint32_t appid, set<string> &word_set,Json::Value &res) {
    uint32_t word_id = 0;   // out-parameter of wordValid(); its value is not needed here
    const vector<string> pieces = splitEx(str, "|");
    for (size_t pos = 0; pos < pieces.size(); ++pos) {
        const string &candidate = pieces[pos];
        if (!SplitManager::Instance()->wordValid(candidate, appid, word_id)) {
            log_error("invalued sp_word!%s",candidate.c_str());
            return RT_ERROR_INVALID_SP_WORD;
        }
        word_set.insert(candidate);
    }
    return 0;
}
// Builds and executes one DTC insert for a top-index row stored under `key`.
// The row carries the doc id, `doc_version`, the current time as
// "created_time", the top start/end times and the weight from `fields`, and
// an empty "extend" column. The outcome is placed in `rst`; the return value
// is whatever Execute() reports.
static int insert_top_index_execute(DTC::Server* dtcServer,string key,const UserTableContent &fields,int doc_version,DTC::Result &rst){
    DTC::InsertRequest request(dtcServer);
    request.SetKey(key.c_str());
    request.Set("doc_id", fields.doc_id.c_str());
    request.Set("doc_version",doc_version);
    request.Set("created_time",time(NULL));
    request.Set("start_time",fields.top_start_time);
    request.Set("end_time",fields.top_end_time);
    request.Set("weight", fields.weight);
    request.Set("extend","");
    return request.Execute(rst);
}
// Inserts a single top-index row under `key` for the document in `fields`.
//
// Before the insert the server's access key is set to the doc id formatted
// with "%40s" into a 41-byte buffer (right-aligned, space padded, cut off by
// the buffer size when longer than 40 characters).
//
// Returns 0 on success and -1 on failure (no server, or the insert request
// failed; in the latter case res[MESSAGE] receives the DTC error message).
int CTaskTopIndex::insert_top_index_dtc(string key,const UserTableContent &fields,int doc_version,Json::Value &res){
    DTC::Server* server = index_servers.GetServer();
    if(NULL == server){
        log_error("GetServer error");
        return -1;
    }

    char access_key[41] = { '0' };
    snprintf(access_key, sizeof(access_key), "%40s", fields.doc_id.c_str());
    server->SetAccessKey(access_key);

    DTC::Result result;
    const int rc = insert_top_index_execute(server,key,fields,doc_version,result);
    if(rc == 0){
        log_debug("insert word:%s sp_word:%s doc_id:%s doc_version: %d to top index!",key.c_str(),fields.sp_words.c_str(),fields.doc_id.c_str(),doc_version);
        return 0;
    }

    log_error("insert request error! ,errno %d ,errmsg %s, errfrom %s\n", rc,result.ErrorMessage(), result.ErrorFrom());
    res[MESSAGE] = result.ErrorMessage();
    return -1;
}
int CTaskTopIndex::do_insert_top_index(const UserTableContent &fields,int doc_version, set<string> &word_set,Json::Value &res) {
int ret;
set<string>::iterator iter = word_set.begin();
for (; iter != word_set.end(); iter++) {
string key = gen_dtc_key_string(fields.appid, "01", *iter);
ret = insert_top_index_dtc(key,fields,doc_version,res);
if(ret < 0)
return RT_ERROR_INSERT_TOP_INDEX_DTC;
}
return 0;
}
int CTaskTopIndex::update_sanpshot_dtc(const UserTableContent &fields,int doc_version,Json::Value &res){
int ret = 0;
DTC::Server* dtc_server = index_servers.GetServer();
if(NULL == dtc_server){
log_error("snapshot server connect error!");
return RT_ERROR_GET_SNAPSHOT;
}
DTC::UpdateRequest updateReq(dtc_server);
ret = updateReq.SetKey(gen_dtc_key_string(fields.appid, "11", fields.doc_id).c_str());
updateReq.Set("doc_version", doc_version);
if(fields.content != "null\n")
updateReq.Set("extend", fields.content.c_str());
updateReq.Set("created_time",fields.publish_time);
DTC::Result rst;
ret = updateReq.Execute(rst);
if (ret != 0)
{
log_error("updateReq error! ,errno %d ,errmsg %s, errfrom %s\n", ret,rst.ErrorMessage(), rst.ErrorFrom());
return RT_ERROR_UPDATE_SNAPSHOT;
}
return ret;
}
// Processes one decoded top-index request.
//
// `req` must carry "table_content" (array), "appid" and "fields_count";
// "fields_count" has to be non-zero and equal to the array length, and the
// appid has to be known to SplitManager. Each array entry is then decoded
// with decode_field() and handled in order:
//
//   add / update  - read the current snapshot version (a missing snapshot is
//                   fine), collect the fields flagged as snapshot fields,
//                   validate and split "sp_words", insert one top-index row
//                   per word, then update (existing doc) or insert (new doc)
//                   the snapshot row.
//   delete        - delete the snapshot row.
//
// Any failure in the add / update path aborts the whole request and its code
// is returned; this now includes a failed snapshot update / insert, whose
// return code used to be discarded so the client was told the request
// succeeded. Rows written before the failure are not rolled back.
//
// NOTE(review): for entries that are neither add nor delete, and for the
// delete path, the code is only kept in `ret` and therefore reaches the
// caller only when the entry is the last one. This is left unchanged.
//
// Returns 0 on success, otherwise one of the RT_* codes (or the raw code of
// the failing snapshot call).
int CTaskTopIndex::top_index_process(Json::Value &req,Json::Value &res){
    int app_id = 0;
    int fields_count = 0;
    Json::Value table_content;

    int ret = decode_request(req, table_content, app_id,fields_count);
    if(ret != 0){
        return ret;
    }
    if(fields_count == 0 || fields_count != (int)table_content.size()){
        return RT_ERROR_FIELD_COUNT;
    }
    if(!SplitManager::Instance()->is_effective_appid(app_id)){
        return RT_NO_APPID;
    }

    for(int i = 0;i < (int)table_content.size();i++){
        int doc_version = 0;
        int old_version = 0;
        UserTableContent content_fields(app_id);
        Json::Value json_field;

        ret = decode_field(table_content[i],json_field,content_fields);
        if(RT_CMD_ADD == ret){
            // An existing snapshot bumps the version; a missing one keeps
            // doc_version at 0, which selects "insert" further down.
            ret = get_snapshot_active_doc(content_fields,old_version,res);
            if(0 == ret){
                doc_version = ++old_version;
            }else if(ret != RT_NO_THIS_DOC){
                return ret;
            }

            // Copy every field flagged as a snapshot field whose JSON type
            // matches the configured field type.
            Json::Value snapshot_content;
            Json::Value::Members member = json_field.getMemberNames();
            for(Json::Value::Members::iterator iter = member.begin(); iter != member.end(); ++iter)
            {
                const string field_name = *iter;
                struct table_info *tbinfo = SplitManager::Instance()->get_table_info(app_id,field_name);
                if(tbinfo == NULL){
                    continue;
                }
                if(tbinfo->snapshot_tag == 1){//snapshot
                    if(tbinfo->field_type == 1 && json_field[field_name].isInt()){
                        snapshot_content[field_name] = json_field[field_name].asInt();
                    }else if(tbinfo->field_type > 1 && json_field[field_name].isString()){
                        snapshot_content[field_name] = json_field[field_name].asString();
                    }
                }
            }

            log_debug("sp_words:%s\n",content_fields.sp_words.c_str());
            set<string> word_set;
            ret = do_split_sp_words(content_fields.sp_words,content_fields.doc_id,content_fields.appid,word_set,res);
            if(0 != ret){
                res[MESSAGE] = "do_split_sp_words error";
                return ret;
            }
            ret = do_insert_top_index(content_fields,doc_version,word_set,res);
            if(0 != ret){
                return ret;
            }

            Json::FastWriter writer;
            content_fields.content = writer.write(snapshot_content);
            if(doc_version != 0){//need update
                ret = update_sanpshot_dtc(content_fields,doc_version,res);
                if(0 != ret){
                    res[MESSAGE] = "update snapshot error";
                    return ret;
                }
            }else{
                ret = insert_snapshot_dtc(content_fields,doc_version,res);//insert the snapshot doc
                if(0 != ret){
                    res[MESSAGE] = "insert snapshot error";
                    return ret;
                }
            }
        }
        else if(RT_CMD_DELETE == ret){
            ret = delete_snapshot_dtc(content_fields.doc_id,content_fields.appid,res);//doc_version is currently not used here
        }
    }
    return ret;
}
// Entry point for one incoming task.
//
// Checks that the task is a SERVICE_TOPINDEX request, parses its payload as
// JSON, hands it to top_index_process() and replies with a JSON document
// whose "code" is 0 on success or the error code otherwise ("message",
// "data" and "reqcmd" are added for the errors detected here).
// The call is bracketed by ProfilerMonitor RegisterInfo / RegisterInfoEnd; a
// NULL task is reported through FunctionError and gets no reply.
void CTaskTopIndex::TaskNotify(CTaskRequest * curr)
{
    log_debug("CTaskTopIndex::TaskNotify start");
    common::CallerInfo caller_info = common::ProfilerMonitor::GetInstance().RegisterInfo(std::string("searchEngine.searchService.topIndexTask"));

    if(curr == NULL){
        common::ProfilerMonitor::GetInstance().FunctionError(caller_info);
        common::ProfilerMonitor::GetInstance().RegisterInfoEnd(caller_info);
        return;
    }

    Json::Value res;
    res["code"] = 0;

    if(SERVICE_TOPINDEX != curr->GetReqCmd()){
        res["code"] = RT_ERROR_SERVICE_TYPE;
        res["reqcmd"] = curr->GetReqCmd();
        res["message"] = "service type wrong! need 107";
    }else{
        std::string req = curr->buildRequsetString();
        log_debug("recv:%s\n",req.c_str());

        Json::Reader reader;
        Json::Value value;
        if(!reader.parse(req,value,false)){
            log_error("parse json error!\ndata:%s errors:%s\n",req.c_str(),reader.getFormattedErrorMessages().c_str());
            res["code"] = RT_PARSE_JSON_ERR;
            res["message"] = reader.getFormattedErrorMessages();
            res["data"] = req;
        }else if(!value.isObject()){
            log_error("parse json error!\ndata:%s errors:%s\n",req.c_str(),reader.getFormattedErrorMessages().c_str());
            res["code"] = RT_PARSE_JSON_ERR;
            res["message"] = "it's not a json";
            res["data"] = req;
        }else{
            const int ret = top_index_process(value,res);
            if(ret != 0){
                res["code"] = ret;
            }
        }
    }

    Json::FastWriter writer;
    curr->setResult(writer.write(res));
    // Race condition warning carried over from the original code: the task
    // may be deleted during ReplyNotify(), so it must not be touched after
    // this call.
    curr->ReplyNotify();
    common::ProfilerMonitor::GetInstance().RegisterInfoEnd(caller_info);
}

View File

@ -0,0 +1,67 @@
/*
* =====================================================================================
*
* Filename: top_index_service.h
*
* Description: class definition.
*
* Version: 1.0
* Created: 09/08/2020 10:02:05 PM
* Revision: none
* Compiler: gcc
*
* Author: shrewdlin, linjinming@jd.com
* Company: JD.com, Inc.
*
* =====================================================================================
*/
#ifndef TOP_INDEX_SERVICE_H_
#define TOP_INDEX_SERVICE_H_
#include <set>
#include <vector>
#include <sstream>
#include "request_base.h"
#include "index_conf.h"
#include "dtcapi.h"
#include "split_manager.h"
using namespace std;
class CPollThread;
class CTaskRequest;
class SplitManager;
class DTCServers;
// Task dispatcher for "top index" requests.
//
// TaskNotify() receives a task, parses its JSON payload and runs
// top_index_process(), which adds, updates or deletes top-index rows and the
// matching snapshot rows through the DTC servers in `index_servers`.
class CTaskTopIndex : public CTaskDispatcher<CTaskRequest>
{
private:
// Poll thread passed to the constructor.
CPollThread * ownerThread;
// Output stage constructed on the same poll thread; BindDispatcher() forwards to it.
CRequestOutput<CTaskRequest> output;
// DTC server set; initialised by pre_process().
DTC::DTCServers index_servers;
private:
// Inserts one top-index row under `key`. Returns 0 on success, -1 on failure
// (res[MESSAGE] receives the DTC error message when the insert request fails).
int insert_top_index_dtc(string key,const UserTableContent &fields,int doc_version,Json::Value &res);
// Inserts one top-index row per word in `word_set`. Returns 0, or
// RT_ERROR_INSERT_TOP_INDEX_DTC at the first failing insert.
int do_insert_top_index(const UserTableContent &fields,int doc_version, set<string> &word_set,Json::Value &res);
// Reads the stored doc_version of the document's snapshot row into
// `doc_version`. Returns 0, RT_NO_THIS_DOC when no row exists, or
// RT_ERROR_GET_SNAPSHOT on failure.
int get_snapshot_active_doc(const UserTableContent &fields,int &doc_version,Json::Value &res);
// Deletes the snapshot row of `doc_id`. Returns 0, RT_ERROR_GET_SNAPSHOT when
// no server is available, or RT_ERROR_DELETE_SNAPSHOT when the delete fails.
int delete_snapshot_dtc(const string &doc_id,uint32_t appid,Json::Value &res);
// Inserts a new snapshot row. Returns 0, RT_ERROR_GET_SNAPSHOT when no server
// is available, or -1 when the insert fails.
int insert_snapshot_dtc(const UserTableContent &fields,int &doc_version,Json::Value &res);
// Splits `str` on '|' into `word_set`, validating each word for `appid`.
// Returns 0, or RT_ERROR_INVALID_SP_WORD at the first invalid word.
int do_split_sp_words(string &str, string &doc_id, uint32_t appid, set<string> &word_set,Json::Value &res);
// Updates an existing snapshot row. Returns 0, RT_ERROR_GET_SNAPSHOT when no
// server is available, or RT_ERROR_UPDATE_SNAPSHOT when the update fails.
int update_sanpshot_dtc(const UserTableContent &fields,int doc_version,Json::Value &res);
public:
// `o` is the poll thread this dispatcher and its output stage are attached to.
CTaskTopIndex(CPollThread * o);
virtual ~CTaskTopIndex();
// Initialises `index_servers` from the DTC index configuration. Always returns 0.
int pre_process(void);
// Handles one parsed request object; returns 0 on success or an error code.
int top_index_process(Json::Value &req,Json::Value &res);
// Forwards the dispatcher `p` to the output stage.
inline void BindDispatcher(CTaskDispatcher<CTaskRequest> *p)
{
output.BindDispatcher(p);
}
// Entry point called with each incoming task; replies to the task with a JSON result.
virtual void TaskNotify(CTaskRequest * curr);
};
#endif /* TOP_INDEX_SERVICE_H_ */