1 /*
2 * Copyright (c) 1995-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License as published by the
6 * Free Software Foundation; either version 2 of the License, or (at your
7 * option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
11 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * for more details.
13 */
14
15 #include "pmapi.h"
16 #include "impl.h"
17 #include "pmcd.h"
18
19 extern int _pmSelectReadable(int, fd_set *);
20
21 /* Routine to split a result into a list of results, each containing metrics
22 * from a single domain. The end of the list is marked by a pmResult with a
23 * numpmid of zero. Any pmids for which there is no agent will be in the
24 * second to last pmResult which will have a negated numpmid value.
25 */
26
27 pmResult **
28 SplitResult(pmResult *res)
29 {
30 int i, j;
31 static int *aFreq = NULL; /* Freq. histogram: pmids for each agent */
32 static int *resIndex = NULL; /* resIndex[k] = index of agent[k]'s list in result */
33 static int nDoms = 0; /* No. of entries in two tables above */
34 int nGood;
35 int need;
36 pmResult **results;
37
38 /* Allocate the frequency histogram and array for mapping from agent to
39 * result list index. Because a SIGHUP reconfiguration may have caused a
40 * change in the number of agents, reallocation using a new size may be
41 * necessary.
42 * There are nAgents + 1 entries in the aFreq and resIndex arrays. The
43 * last entry in each is used for the pmIDs for which no agent could be
44 * found.
45 */
46 if (nAgents > nDoms) {
47 nDoms = nAgents;
48 if (aFreq != NULL)
49 free(aFreq);
50 if (resIndex != NULL)
51 free(resIndex);
52 aFreq = (int *)malloc((nAgents + 1) * sizeof(int));
53 resIndex = (int *)malloc((nAgents + 1) * sizeof(int));
54 if (aFreq == NULL || resIndex == NULL) {
55 __pmNoMem("SplitResult.freq", 2 * (nAgents + 1) * sizeof(int), PM_FATAL_ERR);
56 }
57 }
58
59 /* Build a frequency histogram of metric domains (use aFreq[nAgents] for
60 * pmids for which there is no agent).
61 */
62 for (i = 0; i <= nAgents; i++)
63 aFreq[i] = 0;
64 for (i = 0; i < res->numpmid; i++) {
65 int dom = ((__pmID_int *)&res->vset[i]->pmid)->domain;
66 for (j = 0; j < nAgents; j++)
67 if (agent[j].pmDomainId == dom && agent[j].status.connected)
68 break;
69 aFreq[j]++;
70 }
71
72 /* Initialise resIndex and allocate the results structures */
73 nGood = 0;
74 for (i = 0; i < nAgents; i++)
75 if (aFreq[i]) {
76 resIndex[i] = nGood;
77 nGood++;
78 }
79 resIndex[nAgents] = nGood;
80
81 need = nGood + 1 + ((aFreq[nAgents]) ? 1 : 0);
82 need *= sizeof(pmResult *);
83 if ((results = (pmResult **) malloc(need)) == NULL) {
84 __pmNoMem("SplitResult.results", need, PM_FATAL_ERR);
85 }
86 j = 0;
87 for (i = 0; i <= nAgents; i++)
88 if (aFreq[i]) {
89 need = (int)sizeof(pmResult) + (aFreq[i] - 1) * (int)sizeof(pmValueSet *);
90 results[j] = (pmResult *) malloc(need);
91 if (results[j] == NULL) {
92 __pmNoMem("SplitResult.domain", need, PM_FATAL_ERR);
93 }
94 results[j]->numpmid = aFreq[i];
95 j++;
96 }
97
98 /* Make the "end of list" pmResult */
99 if ((results[j] = (pmResult *) malloc(sizeof(pmResult))) == NULL) {
100 __pmNoMem("SplitResult.domain", sizeof(pmResult), PM_FATAL_ERR);
101 }
102 results[j]->numpmid = 0;
103
104 /* Foreach vset in res, find it's pmResult in the per domain results array
105 * and copy a pointer to the vset to the next available position in the per
106 * domain result.
107 */
108 for (i = 0; i <= nAgents; i++)
109 aFreq[i] = 0;
110 for (i = 0; i < res->numpmid; i++) {
111 int dom = ((__pmID_int *)&res->vset[i]->pmid)->domain;
112 for (j = 0; j < nAgents; j++)
113 if (dom == agent[j].pmDomainId && agent[j].status.connected)
114 break;
115 results[resIndex[j]]->vset[aFreq[j]] = res->vset[i];
116 aFreq[j]++;
117 }
118
119 /* Flip the sign of numpmids in the "bad list" */
120 if (aFreq[nAgents]) {
121 int bad = resIndex[nAgents];
122 results[bad]->numpmid = -results[bad]->numpmid;
123 }
124
125 return results;
126 }
127
128 int
129 DoStore(ClientInfo *cp, __pmPDU* pb)
130 {
131 int sts;
132 int s;
133 AgentInfo *ap;
134 pmResult *result;
135 pmResult **dResult;
136 int i;
137 fd_set readyFds;
138 fd_set waitFds;
139 int nWait = 0;
140 int maxFd = -1;
141 int badStore; /* != 0 => store to nonexistent agent */
142 int notReady = 0; /* != 0 => store to agent that's not ready */
143 struct timeval timeout;
144
145
146 if ((sts = __pmDecodeResult(pb, &result)) < 0)
147 return sts;
148
149 dResult = SplitResult(result);
150
151 /* Send the per-domain results to their respective agents */
152
153 FD_ZERO(&waitFds);
154 for (i = 0; dResult[i]->numpmid > 0; i++) {
155 int fd;
156 ap = FindDomainAgent(((__pmID_int *)&dResult[i]->vset[0]->pmid)->domain);
157 /* If it's in a "good" list, pmID has agent that is connected */
158
159 if (ap->ipcType == AGENT_DSO) {
160 if (ap->ipc.dso.dispatch.comm.pmda_interface >= PMDA_INTERFACE_5)
161 ap->ipc.dso.dispatch.version.four.ext->e_context = cp - client;
162 if (ap->ipc.dso.dispatch.comm.pmda_interface >= PMDA_INTERFACE_4)
163 s = ap->ipc.dso.dispatch.version.four.store(dResult[i],
164 ap->ipc.dso.dispatch.version.four.ext);
165 else if (ap->ipc.dso.dispatch.comm.pmda_interface == PMDA_INTERFACE_2 ||
166 ap->ipc.dso.dispatch.comm.pmda_interface == PMDA_INTERFACE_3)
167 s = ap->ipc.dso.dispatch.version.two.store(dResult[i],
168 ap->ipc.dso.dispatch.version.two.ext);
169 else
170 s = ap->ipc.dso.dispatch.version.one.store(dResult[i]);
171 if (s < 0 &&
172 ap->ipc.dso.dispatch.comm.pmapi_version == PMAPI_VERSION_1)
173 s = XLATE_ERR_1TO2(s);
174 }
175 else {
176 if (ap->status.notReady == 0) {
177 /* agent is ready for PDUs */
178 if (_pmcd_trace_mask)
179 pmcd_trace(TR_XMIT_PDU, ap->inFd, PDU_RESULT, dResult[i]->numpmid);
180 s = __pmSendResult(ap->inFd, cp - client, dResult[i]);
181 if (s >= 0) {
182 ap->status.busy = 1;
183 fd = ap->outFd;
184 FD_SET(fd, &waitFds);
185 if (fd > maxFd)
186 maxFd = fd;
187 nWait++;
188 }
189 else if (s == PM_ERR_IPC || sts == PM_ERR_TIMEOUT || s == -EPIPE) {
190 pmcd_trace(TR_XMIT_ERR, ap->inFd, PDU_RESULT, sts);
191 CleanupAgent(ap, AT_COMM, ap->inFd);
192 }
193 }
194 else
195 /* agent is not ready for PDUs */
196 notReady = 1;
197 }
198 if (s < 0) {
199 sts = s;
200 continue;
201 }
202 }
203
204 /* If there was no agent for one or more pmIDs, there will be a "bad list"
205 * with a negated numpmid value. Store as many "good" pmIDs as possible
206 * but remember that there were homeless ones.
207 */
208 badStore = dResult[i]->numpmid < 0;
209
210 /* Collect error PDUs containing store status from each active agent */
211
212 while (nWait > 0) {
213 memcpy(&readyFds, &waitFds, sizeof(readyFds));
214 if (nWait > 1) {
215 timeout.tv_sec = _pmcd_timeout;
216 timeout.tv_usec = 0;
217
218 s = select(maxFd+1, &readyFds, NULL, NULL, &timeout);
219
220 if (s == 0) {
221 __pmNotifyErr(LOG_INFO, "DoStore: select timeout");
222
223 /* Timeout, terminate agents that haven't responded */
224 for (i = 0; i < nAgents; i++) {
225 if (agent[i].status.busy) {
226 pmcd_trace(TR_RECV_TIMEOUT, agent[i].outFd, PDU_ERROR, 0);
227 CleanupAgent(&agent[i], AT_COMM, agent[i].inFd);
228 }
229 }
230 sts = PM_ERR_IPC;
231 break;
232 }
233 else if (sts < 0) {
234 /* this is not expected to happen! */
235 __pmNotifyErr(LOG_ERR, "DoStore: fatal select failure: %s\n",
236 netstrerror());
237 Shutdown();
238 exit(1);
239 }
240 }
241
242 for (i = 0; i < nAgents; i++) {
243 ap = &agent[i];
244 if (!ap->status.busy || !FD_ISSET(ap->outFd, &readyFds))
245 continue;
246 ap->status.busy = 0;
247 FD_CLR(ap->outFd, &waitFds);
248 nWait--;
249 s = __pmGetPDU(ap->outFd, ANY_SIZE, _pmcd_timeout, &pb);
250 if (s > 0 && _pmcd_trace_mask)
251 pmcd_trace(TR_RECV_PDU, ap->outFd, s, (int)((__psint_t)pb & 0xffffffff));
252 if (s == PDU_ERROR) {
253 int ss;
254 if ((ss = __pmDecodeError(pb, &s)) < 0)
255 sts = ss;
256 else {
257 if (s < 0) {
258 extern int CheckError(AgentInfo *, int);
259
260 sts = CheckError(ap, s);
261 pmcd_trace(TR_RECV_ERR, ap->outFd, PDU_RESULT, sts);
262 }
263 }
264 }
265 else {
266 /* Agent protocol error */
267 if (s < 0)
268 pmcd_trace(TR_RECV_ERR, ap->outFd, PDU_RESULT, s);
269 else
270 pmcd_trace(TR_WRONG_PDU, ap->outFd, PDU_ERROR, s);
271 sts = PM_ERR_IPC;
272 }
273
274 if (ap->ipcType != AGENT_DSO &&
275 (sts == PM_ERR_IPC || sts == PM_ERR_TIMEOUT))
276 CleanupAgent(ap, AT_COMM, ap->outFd);
277 }
278 }
279
280 /* Only one error code can be returned, so "no agent" or "not
281 * ready" errors have precedence over all except IPC and TIMEOUT
282 * protocol failures.
283 * Note that we make only a weak effort to return the most
284 * appropriate error status because clients interested in the
285 * outcome should be using pmStore on individual metric/instances
286 * if the outcome is important. In particular, in multi-agent
287 * stores, an earlier PM_ERR_IPC error can be "overwritten" by a
288 * subsequent less serious error.
289 */
290 if (sts != PM_ERR_IPC && sts != PM_ERR_TIMEOUT) {
291 if (badStore) {
292 sts = PM_ERR_NOAGENT;
293 }
294 else if (notReady) {
295 sts = PM_ERR_AGAIN;
296 }
297 }
298
299 if (sts >= 0) {
300 /* send PDU_ERROR, even if result was 0 */
301 int s;
302 if (_pmcd_trace_mask)
303 pmcd_trace(TR_XMIT_PDU, cp->fd, PDU_ERROR, 0);
304 s = __pmSendError(cp->fd, FROM_ANON, 0);
305 if (s < 0)
306 CleanupClient(cp, s);
307 }
308
309 pmFreeResult(result);
310 i = 0;
311 do {
312 s = dResult[i]->numpmid;
313 free(dResult[i]);
314 i++;
315 } while (s); /* numpmid == 0 terminates list */
316 free(dResult);
317
318 return sts;
319 }