#include "stdafx.h"
#include "global.h"
#include "funs.h"
#include "urlquery.h"
#include "htmlParser.h"
#include "funs.h"
// IUnknown::AddRef: increments the reference count and returns the new value.
STDMETHODIMP_(ULONG) ThtmlParser::AddRef()
{
    ++m_ref;
    return m_ref;
}
// IUnknown::Release: decrements the reference count. When the count reaches
// zero the object deletes itself and 0 is returned; otherwise the remaining
// count is returned.
STDMETHODIMP_(ULONG) ThtmlParser::Release()
{
    --m_ref;
    if (m_ref != 0)
        return m_ref;

    // Last reference is gone: no member may be touched after this point.
    delete this;
    return 0;
}
// IUnknown::QueryInterface: hands out the interfaces this object implements.
//
// riid - identifier of the requested interface.
// ppv  - receives the interface pointer; set to NULL when the interface is
//        not supported.
//
// Returns NOERROR (after AddRef'ing the object) for IUnknown,
// IPropertyNotifySink, IOleClientSite and IDispatch, E_NOINTERFACE for any
// other IID, and E_POINTER when ppv itself is NULL.
STDMETHODIMP ThtmlParser::QueryInterface(REFIID riid, void** ppv)
{
    if (!ppv)
        return E_POINTER;

    *ppv = NULL;
    if (IID_IUnknown == riid || IID_IPropertyNotifySink == riid)
        // IUnknown is always reached through the IPropertyNotifySink base so
        // that the same identity pointer is returned every time.
        *ppv = (LPUNKNOWN) (IPropertyNotifySink*) this;
    else if (IID_IOleClientSite == riid)
        *ppv = (IOleClientSite*)this;
    else if (IID_IDispatch == riid)
        *ppv = (IDispatch*)this;
    else
        // COM requires E_NOINTERFACE (not E_NOTIMPL) for unsupported IIDs.
        return E_NOINTERFACE;

    AddRef();
    return NOERROR;
}
// IPropertyNotifySink::OnChanged: reacts to property-change notifications.
//
// Only DISPID_READYSTATE is handled. The current ready state is read back
// from m_MSHTML through IDispatch::Invoke and cached in m_readyState. When it
// becomes READYSTATE_COMPLETE, WM_APP_PARSER_READY is posted to the calling
// thread's message queue.
//
// Always returns NOERROR, whether or not the state could be read.
STDMETHODIMP ThtmlParser::OnChanged (DISPID dispID)
{
    if (dispID != DISPID_READYSTATE)
        return NOERROR;

    VARIANT state = {0};
    DISPPARAMS noArgs = {NULL, NULL, 0, 0};
    const HRESULT hrGet = m_MSHTML->Invoke(
        DISPID_READYSTATE, IID_NULL, LOCALE_SYSTEM_DEFAULT, DISPATCH_PROPERTYGET,
        &noArgs, &state, NULL, NULL);
    if (FAILED(hrGet))
        return NOERROR;

    m_readyState = (READYSTATE)V_I4(&state);
    if (READYSTATE_COMPLETE == m_readyState)
    {
        const DWORD threadId = GetCurrentThreadId();
        // Keep retrying until the post succeeds, backing off for a random
        // interval of less than 300 ms between attempts.
        for (;;)
        {
            if (PostThreadMessage(threadId, WM_APP_PARSER_READY, 0, 0))
                break;
            Sleep (rand() % 300);
        }
    }
    VariantClear(&state);
    return NOERROR;
}
// IDispatch::Invoke: answers the ambient-property queries made on this
// client site.
//
// dispIdMember - the ambient property being requested.
// pVarResult   - receives the property value; must not be NULL.
// The remaining parameters are not used.
//
// Returns NOERROR for DISPID_AMBIENT_DLCONTROL and DISPID_AMBIENT_USERMODE,
// DISP_E_MEMBERNOTFOUND for every other DISPID, and E_POINTER when
// pVarResult is NULL.
STDMETHODIMP ThtmlParser::Invoke(
    DISPID dispIdMember, REFIID riid, LCID lcid, WORD wFlags,
    DISPPARAMS __RPC_FAR *pDispParams, VARIANT __RPC_FAR *pVarResult,
    EXCEPINFO __RPC_FAR *pExcepInfo, UINT __RPC_FAR *puArgErr)
{
    if (pVarResult == NULL)
        return E_POINTER;

    if (dispIdMember == DISPID_AMBIENT_DLCONTROL)
    {
        // Download-control flags: download only, with scripts, Java,
        // ActiveX download/execution, client pull and frame download all
        // switched off.
        V_VT(pVarResult) = VT_I4;
        V_I4(pVarResult) =
            DLCTL_DOWNLOADONLY | DLCTL_NO_SCRIPTS | DLCTL_NO_JAVA |
            DLCTL_NO_DLACTIVEXCTLS | DLCTL_NO_RUNACTIVEXCTLS | DLCTL_NO_CLIENTPULL | DLCTL_NO_FRAMEDOWNLOAD;
        return NOERROR;
    }

    if (dispIdMember == DISPID_AMBIENT_USERMODE)
    {
        V_VT(pVarResult) = VT_BOOL;
        V_BOOL(pVarResult) = VARIANT_FALSE;
        return NOERROR;
    }

    return DISP_E_MEMBERNOTFOUND;
}
// Copies src into the fixed-size char array dst, truncating when necessary
// and always NUL-terminating. A NULL src produces an empty string.
// Deliberately accepts only true arrays so the bound is known at compile time.
template <size_t N>
static void ThtmlParser_copyBounded(char (&dst)[N], const char* src)
{
    strncpy(dst, src ? src : "", N - 1);
    dst[N - 1] = '\0';
}

// Builds a parser for one downloaded page.
//
// baseurl - URL the page was fetched from; stored in m_baseurl.
// html    - stored in m_html; load() later hands it to IPersistFile::Load,
//           so it is used as a file path.
// bfollow - when true, get_nextLink() also reports anchor (<a>) links.
//
// Creates the MSHTML document object, installs this object as its client
// site, asks it to re-read the download-control ambient property, and
// subscribes to its property-change notifications. Any failure leaves the
// corresponding member NULL; m_connected keeps the result of Advise().
ThtmlParser::ThtmlParser(const char* baseurl, const char* html, bool bfollow)
{
    // Bounded copies: the original strcpy calls overflowed on long input.
    ThtmlParser_copyBounded(m_baseurl, baseurl);
    ThtmlParser_copyBounded(m_html, html);
    m_bfollow = bfollow;
    m_ref = 1;
    m_MSHTML = NULL;
    m_cp = NULL;
    m_cookie = 0;
    m_readyState = READYSTATE_UNINITIALIZED;
    m_coll = NULL;
    m_cElem = m_iElem = 0;
    m_connected = CONNECT_E_CANNOTCONNECT;

    CoCreateInstance(CLSID_HTMLDocument, NULL,
        CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, (void **)&m_MSHTML);
    if (!m_MSHTML)
        return;

    // Become the document's client site so it queries our ambient properties.
    LPOLEOBJECT pOB = NULL;
    m_MSHTML->QueryInterface(IID_IOleObject, (void**)&pOB);
    if (pOB) {
        pOB->SetClientSite((IOleClientSite*)this);
        pOB->Release();
    }

    // Make the document pick up the download-control flags served by Invoke().
    LPOLECONTROL pOC = NULL;
    m_MSHTML->QueryInterface(IID_IOleControl, (void**)&pOC);
    if (pOC) {
        pOC->OnAmbientPropertyChange(DISPID_AMBIENT_DLCONTROL);
        pOC->Release();
    }

    // Subscribe to property-change notifications (delivered to OnChanged()).
    LPCONNECTIONPOINTCONTAINER pCPC = NULL;
    m_MSHTML->QueryInterface(IID_IConnectionPointContainer, (void**)&pCPC);
    if (pCPC) {
        pCPC->FindConnectionPoint(IID_IPropertyNotifySink, &m_cp);
        pCPC->Release();
    }
    if (m_cp)
        m_connected = m_cp->Advise((LPUNKNOWN) (IPropertyNotifySink*) this, &m_cookie);
}
// Releases every COM reference held by the parser: the element collection,
// the connection point (unsubscribing first when Advise() had succeeded),
// and finally the document object itself.
ThtmlParser::~ThtmlParser()
{
    if (m_coll != NULL)
        m_coll->Release();

    if (m_cp != NULL)
    {
        // Only unadvise a subscription that was actually established.
        if (!FAILED(m_connected))
            m_cp->Unadvise(m_cookie);
        m_cp->Release();
    }

    if (m_MSHTML != NULL)
        m_MSHTML->Release();
}
// Loads the file named by m_html into the MSHTML document through
// IPersistFile::Load.
//
// Returns the HRESULT of IPersistFile::Load on the normal path. Fails with
// E_UNEXPECTED when the document object was never created, with the
// QueryInterface error (or E_NOINTERFACE) when IPersistFile is unavailable,
// and with the converted Win32 error when m_html cannot be converted to a
// wide-character path that fits in MAX_PATH characters.
HRESULT ThtmlParser::load()
{
    if (!m_MSHTML)
        return E_UNEXPECTED;

    LPPERSISTFILE pPersistFile = NULL;
    HRESULT hr = m_MSHTML->QueryInterface(IID_IPersistFile, (void **)&pPersistFile);
    if (!pPersistFile)
        return FAILED(hr) ? hr : E_NOINTERFACE;

    WCHAR w_file [MAX_PATH] = {0};
    // The destination size is a count of WCHARs, not bytes. Passing -1 as the
    // source length converts the terminating NUL too, so the result is always
    // terminated or the call fails cleanly.
    if (MultiByteToWideChar(CP_THREAD_ACP, 0, m_html, -1, w_file, MAX_PATH) == 0)
    {
        hr = HRESULT_FROM_WIN32(GetLastError());
        pPersistFile->Release();
        return hr;
    }

    hr = pPersistFile->Load(w_file, 0);
    pPersistFile->Release();
    return hr;
}
long ThtmlParser::get_nextLink(char* url)
{
if (!m_coll)
{
m_MSHTML->get_all(&m_coll);
if (!m_coll)
return ELEM_TERM;
m_coll->get_length(&m_cElem);
m_iElem = 0;
}
while (1)
{
if (m_iElem >= m_cElem)
return ELEM_TERM;
VARIANT index, nil = {0};
index.vt = VT_UINT;
index.lVal = m_iElem;
HRESULT hr;
IDispatch* disp = NULL;
hr = m_coll->item(index, nil, &disp);
if (!disp) return ELEM_TERM;
IHTMLAnchorElement *anchor = NULL;
IHTMLImgElement *img = NULL;
IHTMLFrameBase *frame = NULL;
IHTMLMetaElement *refresh= NULL;
hr = disp->QueryInterface(IID_IHTMLImgElement, (void **)&img);
hr = disp->QueryInterface(IID_IHTMLFrameBase, (void **)&frame);
hr = disp->QueryInterface(IID_IHTMLMetaElement, (void **)&refresh);
if (m_bfollow) hr = disp->QueryInterface(IID_IHTMLAnchorElement, (void **)&anchor);
if (refresh)
{
bool b = get_refreshLink(refresh, url);
refresh->Release();
disp->Release();
m_iElem++;
if (b) return ELEM_REFRESH;
}
else if (anchor)
{
get_anchorLink(anchor, url);
anchor->Release();
disp->Release();
m_iElem++;
return ELEM_ANCHOR;
}
else if (img)
{
get_imgLink(img, url);
img->Release();
disp->Release();
m_iElem++;
return ELEM_IMG;
}
else if (frame)
{
get_frameLink(frame, url);
frame->Release();
disp->Release();
m_iElem++;
return ELEM_FRAME;
}
else
{
m_iElem++;
disp->Release();
}
}//while(1)
}
// Extracts the source URL of a frame element.
//
// frame - the frame element to read.
// url   - receives the result. A non-file URL is copied unchanged; a file
//         URL is mapped back to a web URL with fileurl_to_weburl(),
//         lower-cased, and passed through remove_urlfragment(),
//         remove_urlslash() and sort_urlquery().
//
// A missing attribute, or one too long to fit in MAX_URL - 1 bytes, yields
// an empty string.
void ThtmlParser::get_frameLink(IHTMLFrameBase* frame, char* url)
{
    char src [MAX_URL] = {0};
    BSTR bstr = NULL;
    frame->get_src(&bstr);
    if (bstr)
    {
        // Keep the final byte for the terminator; on failure the buffer may
        // hold partial output, so reset it to an empty string.
        if (!WideCharToMultiByte(0, 0, bstr, (int)wcslen(bstr), src, MAX_URL - 1, NULL, NULL))
            src[0] = '\0';
        FREE_BSTR(bstr);
    }
    if (get_urlscheme(src) != INTERNET_SCHEME_FILE) {
        strcpy(url, src);
        return;
    }
    fileurl_to_weburl(src, url);
    _strlwr(url);
    remove_urlfragment(url);
    remove_urlslash(url);
    sort_urlquery(url);
}
// Extracts the href of an anchor element.
//
// anchor - the anchor element to read.
// url    - receives the result. A non-file URL is copied unchanged; a file
//          URL is mapped back to a web URL with fileurl_to_weburl(),
//          lower-cased, and passed through remove_urlfragment(),
//          remove_urlslash() and sort_urlquery().
//
// A missing attribute, or one too long to fit in MAX_URL - 1 bytes, yields
// an empty string.
void ThtmlParser::get_anchorLink(IHTMLAnchorElement* anchor, char* url)
{
    char href [MAX_URL] = {0};
    BSTR bstr = NULL;
    anchor->get_href(&bstr);
    if (bstr)
    {
        // Keep the final byte for the terminator; on failure the buffer may
        // hold partial output, so reset it to an empty string.
        if (!WideCharToMultiByte(0, 0, bstr, (int)wcslen(bstr), href, MAX_URL - 1, NULL, NULL))
            href[0] = '\0';
        FREE_BSTR(bstr);
    }
    if (get_urlscheme(href) != INTERNET_SCHEME_FILE) {
        strcpy(url, href);
        return;
    }
    fileurl_to_weburl(href, url);
    _strlwr(url);
    remove_urlfragment(url);
    remove_urlslash(url);
    sort_urlquery(url);
}
// Extracts the href of an image element.
//
// img - the image element to read.
// url - receives the result. A non-file URL is copied unchanged; a file URL
//       is mapped back to a web URL with fileurl_to_weburl(), lower-cased,
//       and passed through remove_urlfragment(), remove_urlslash() and
//       sort_urlquery().
//
// A missing attribute, or one too long to fit in MAX_URL - 1 bytes, yields
// an empty string.
void ThtmlParser::get_imgLink(IHTMLImgElement* img, char* url)
{
    char href [MAX_URL] = {0};
    BSTR bstr = NULL;
    img->get_href(&bstr);
    if (bstr)
    {
        // Keep the final byte for the terminator; on failure the buffer may
        // hold partial output, so reset it to an empty string.
        if (!WideCharToMultiByte(0, 0, bstr, (int)wcslen(bstr), href, MAX_URL - 1, NULL, NULL))
            href[0] = '\0';
        FREE_BSTR(bstr);
    }
    if (get_urlscheme(href) != INTERNET_SCHEME_FILE) {
        strcpy(url, href);
        return;
    }
    fileurl_to_weburl(href, url);
    _strlwr(url);
    remove_urlfragment(url);
    remove_urlslash(url);
    sort_urlquery(url);
}
// Extracts the target URL of a <meta http-equiv="refresh"> element.
//
// refresh - the meta element to inspect.
// url     - receives the result when the element is a refresh directive. A
//           non-file URL is copied unchanged; a file URL is mapped back to a
//           web URL with fileurl_to_weburl(), lower-cased, and passed
//           through remove_urlfragment(), remove_urlslash() and
//           sort_urlquery().
//
// Returns false, leaving url untouched, when http-equiv is not "refresh"
// (compared case-insensitively); true otherwise. A value too long to fit in
// MAX_URL - 1 bytes is treated as empty.
bool ThtmlParser::get_refreshLink(IHTMLMetaElement* refresh, char* url)
{
    char meta [MAX_URL] = {0};
    char href [MAX_URL] = {0};
    BSTR bstr = NULL;
    refresh->get_httpEquiv(&bstr);
    if (bstr)
    {
        // Keep the final byte for the terminator; on failure the buffer may
        // hold partial output, so reset it to an empty string.
        if (!WideCharToMultiByte(0, 0, bstr, (int)wcslen(bstr), meta, MAX_URL - 1, NULL, NULL))
            meta[0] = '\0';
        FREE_BSTR(bstr);
    }
    if (stricmp(meta, "refresh") != 0) return false;

    // Start from NULL again so a failing get_url() cannot leave a stale,
    // already-freed pointer behind.
    bstr = NULL;
    refresh->get_url(&bstr);
    if (bstr)
    {
        if (!WideCharToMultiByte(0, 0, bstr, (int)wcslen(bstr), href, MAX_URL - 1, NULL, NULL))
            href[0] = '\0';
        FREE_BSTR(bstr);
    }
    if (get_urlscheme(href) != INTERNET_SCHEME_FILE) {
        strcpy(url, href);
        return true;
    }
    fileurl_to_weburl(href, url);
    _strlwr(url);
    remove_urlfragment(url);
    remove_urlslash(url);
    sort_urlquery(url);
    return true;
}
// Maps a file: URL back to a web URL.
//
// fileurl - the file URL to convert.
// weburl  - receives the combined URL; the buffer is treated as MAX_URL
//           bytes long. Set to an empty string when the URLs cannot be
//           combined.
//
// The path taken from fileurl by get_urlpath() is made relative to m_html,
// any "?query" found in that path is appended again, and the result is
// combined with m_baseurl.
void ThtmlParser::fileurl_to_weburl(const char* fileurl, char* weburl)
{
    char fileurlPath [MAX_PATH] = {0};
    get_urlpath(fileurl, fileurlPath);

    char relurl [MAX_URL] = {0};
    PathRelativePathTo(relurl, m_html, FILE_ATTRIBUTE_NORMAL, fileurlPath, FILE_ATTRIBUTE_NORMAL);

    const char* query = strchr(fileurlPath, '?');
    if (query)
        // Bounded append: never write past the end of relurl.
        strncat(relurl, query, MAX_URL - strlen(relurl) - 1);

    DWORD len = MAX_URL;
    // On failure the output buffer is not reliably written, so hand back an
    // empty string rather than leaving the caller with undefined contents.
    if (!InternetCombineUrl(m_baseurl, relurl, weburl, &len, 0))
        weburl[0] = '\0';
}